library(statsr)
library(dplyr)
library(MASS)
library(BAS)
library(ggplot2)
library(devtools)
library(gridExtra)
library(grid)
library(GGally)
library(PreProcess)
# Load the pre-split Ames housing data sets (train / test / validation)
load("ames_train.Rdata")
load("ames_test.Rdata")
load("ames_validation.Rdata")
# Confirm the number of observations in each split
nrow(ames_train)
## [1] 1000
nrow(ames_test)
## [1] 817
nrow(ames_validation)
## [1] 763
Let us first take a look into the structure and the dimension of the data set.
# Show the entire structure of the data set
# Stack the three splits so cleaning/EDA is applied consistently;
# rows 1:nrow(ames_train) remain the training portion throughout.
ames_all <- rbind(ames_train, ames_test, ames_validation)
str(ames_all)
## tibble [2,580 × 81] (S3: tbl_df/tbl/data.frame)
## $ PID : int [1:2580] 909176150 905476230 911128020 535377150 534177230 908128060 902135020 528228540 923426010 908186050 ...
## $ area : int [1:2580] 856 1049 1001 1039 1665 1922 936 1246 889 1072 ...
## $ price : int [1:2580] 126000 139500 124900 114000 227000 198500 93000 187687 137500 140000 ...
## $ MS.SubClass : int [1:2580] 30 120 30 70 60 85 20 20 20 180 ...
## $ MS.Zoning : Factor w/ 7 levels "A (agr)","C (all)",..: 6 6 2 6 6 6 7 6 6 7 ...
## $ Lot.Frontage : int [1:2580] NA 42 60 80 70 64 60 53 74 35 ...
## $ Lot.Area : int [1:2580] 7890 4235 6060 8146 8400 7301 6000 3710 12395 3675 ...
## $ Street : Factor w/ 2 levels "Grvl","Pave": 2 2 2 2 2 2 2 2 2 2 ...
## $ Alley : Factor w/ 2 levels "Grvl","Pave": NA NA NA NA NA NA 2 NA NA NA ...
## $ Lot.Shape : Factor w/ 4 levels "IR1","IR2","IR3",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ Land.Contour : Factor w/ 4 levels "Bnk","HLS","Low",..: 4 4 4 4 4 4 1 4 4 4 ...
## $ Utilities : Factor w/ 3 levels "AllPub","NoSeWa",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Lot.Config : Factor w/ 5 levels "Corner","CulDSac",..: 1 5 5 1 5 1 5 5 1 5 ...
## $ Land.Slope : Factor w/ 3 levels "Gtl","Mod","Sev": 1 1 1 1 1 1 2 1 1 1 ...
## $ Neighborhood : Factor w/ 28 levels "Blmngtn","Blueste",..: 26 8 12 21 20 8 21 1 15 8 ...
## $ Condition.1 : Factor w/ 9 levels "Artery","Feedr",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Condition.2 : Factor w/ 8 levels "Artery","Feedr",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Bldg.Type : Factor w/ 5 levels "1Fam","2fmCon",..: 1 5 1 1 1 1 2 1 1 5 ...
## $ House.Style : Factor w/ 8 levels "1.5Fin","1.5Unf",..: 3 3 3 6 6 7 3 3 3 7 ...
## $ Overall.Qual : int [1:2580] 6 5 5 4 8 7 4 7 5 6 ...
## $ Overall.Cond : int [1:2580] 6 5 9 8 6 5 4 5 6 5 ...
## $ Year.Built : int [1:2580] 1939 1984 1930 1900 2001 2003 1953 2007 1984 2005 ...
## $ Year.Remod.Add : int [1:2580] 1950 1984 2007 2003 2001 2003 1953 2008 1984 2005 ...
## $ Roof.Style : Factor w/ 6 levels "Flat","Gable",..: 2 2 4 2 2 2 2 2 2 2 ...
## $ Roof.Matl : Factor w/ 8 levels "ClyTile","CompShg",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Exterior.1st : Factor w/ 16 levels "AsbShng","AsphShn",..: 15 7 9 9 14 7 9 16 7 14 ...
## $ Exterior.2nd : Factor w/ 17 levels "AsbShng","AsphShn",..: 16 7 9 9 15 7 9 17 11 15 ...
## $ Mas.Vnr.Type : Factor w/ 6 levels "","BrkCmn","BrkFace",..: 5 3 5 5 5 3 5 3 5 6 ...
## $ Mas.Vnr.Area : int [1:2580] 0 149 0 0 0 500 0 20 0 76 ...
## $ Exter.Qual : Factor w/ 4 levels "Ex","Fa","Gd",..: 4 3 3 3 3 3 2 3 4 4 ...
## $ Exter.Cond : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 3 5 5 5 5 5 5 ...
## $ Foundation : Factor w/ 6 levels "BrkTil","CBlock",..: 2 2 1 1 3 4 2 3 2 3 ...
## $ Bsmt.Qual : Factor w/ 6 levels "","Ex","Fa","Gd",..: 6 4 6 3 4 NA 3 4 6 4 ...
## $ Bsmt.Cond : Factor w/ 6 levels "","Ex","Fa","Gd",..: 6 6 6 6 6 NA 6 6 6 6 ...
## $ Bsmt.Exposure : Factor w/ 5 levels "","Av","Gd","Mn",..: 5 4 5 5 5 NA 5 3 5 3 ...
## $ BsmtFin.Type.1 : Factor w/ 7 levels "","ALQ","BLQ",..: 6 4 2 7 4 NA 7 7 2 4 ...
## $ BsmtFin.SF.1 : int [1:2580] 238 552 737 0 643 0 0 0 647 467 ...
## $ BsmtFin.Type.2 : Factor w/ 7 levels "","ALQ","BLQ",..: 7 2 7 7 7 NA 7 7 7 7 ...
## $ BsmtFin.SF.2 : int [1:2580] 0 393 0 0 0 0 0 0 0 0 ...
## $ Bsmt.Unf.SF : int [1:2580] 618 104 100 405 167 0 936 1146 217 80 ...
## $ Total.Bsmt.SF : int [1:2580] 856 1049 837 405 810 0 936 1146 864 547 ...
## $ Heating : Factor w/ 6 levels "Floor","GasA",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Heating.QC : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 1 3 1 1 5 1 5 1 ...
## $ Central.Air : Factor w/ 2 levels "N","Y": 2 2 2 2 2 2 1 2 2 2 ...
## $ Electrical : Factor w/ 6 levels "","FuseA","FuseF",..: 6 6 6 6 6 6 6 6 6 6 ...
## $ X1st.Flr.SF : int [1:2580] 856 1049 1001 717 810 495 936 1246 889 1072 ...
## $ X2nd.Flr.SF : int [1:2580] 0 0 0 322 855 1427 0 0 0 0 ...
## $ Low.Qual.Fin.SF: int [1:2580] 0 0 0 0 0 0 0 0 0 0 ...
## $ Bsmt.Full.Bath : int [1:2580] 1 1 0 0 1 0 0 0 0 1 ...
## $ Bsmt.Half.Bath : int [1:2580] 0 0 0 0 0 0 0 0 0 0 ...
## $ Full.Bath : int [1:2580] 1 2 1 1 2 3 1 2 1 1 ...
## $ Half.Bath : int [1:2580] 0 0 0 0 1 0 0 0 0 0 ...
## $ Bedroom.AbvGr : int [1:2580] 2 2 2 2 3 4 2 2 3 2 ...
## $ Kitchen.AbvGr : int [1:2580] 1 1 1 1 1 1 1 1 1 1 ...
## $ Kitchen.Qual : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 3 3 5 3 3 5 3 5 3 ...
## $ TotRms.AbvGrd : int [1:2580] 4 5 5 6 6 7 4 5 6 5 ...
## $ Functional : Factor w/ 8 levels "Maj1","Maj2",..: 8 8 8 8 8 8 4 8 8 8 ...
## $ Fireplaces : int [1:2580] 1 0 0 0 0 1 0 1 0 0 ...
## $ Fireplace.Qu : Factor w/ 5 levels "Ex","Fa","Gd",..: 3 NA NA NA NA 1 NA 3 NA NA ...
## $ Garage.Type : Factor w/ 6 levels "2Types","Attchd",..: 6 2 6 6 2 4 6 2 2 3 ...
## $ Garage.Yr.Blt : int [1:2580] 1939 1984 1930 1940 2001 2003 1974 2007 1984 2005 ...
## $ Garage.Finish : Factor w/ 4 levels "","Fin","RFn",..: 4 2 4 4 2 3 4 2 4 2 ...
## $ Garage.Cars : int [1:2580] 2 1 1 1 2 2 2 2 2 2 ...
## $ Garage.Area : int [1:2580] 399 266 216 281 528 672 576 428 484 525 ...
## $ Garage.Qual : Factor w/ 6 levels "","Ex","Fa","Gd",..: 6 6 6 6 6 6 6 6 6 6 ...
## $ Garage.Cond : Factor w/ 6 levels "","Ex","Fa","Gd",..: 6 6 5 6 6 6 6 6 6 6 ...
## $ Paved.Drive : Factor w/ 3 levels "N","P","Y": 3 3 1 1 3 3 3 3 3 3 ...
## $ Wood.Deck.SF : int [1:2580] 0 0 154 0 0 0 0 100 0 0 ...
## $ Open.Porch.SF : int [1:2580] 0 105 0 0 45 0 32 24 0 44 ...
## $ Enclosed.Porch : int [1:2580] 0 0 42 168 0 177 112 0 0 0 ...
## $ X3Ssn.Porch : int [1:2580] 0 0 86 0 0 0 0 0 0 0 ...
## $ Screen.Porch : int [1:2580] 166 0 0 111 0 0 0 0 0 0 ...
## $ Pool.Area : int [1:2580] 0 0 0 0 0 0 0 0 0 0 ...
## $ Pool.QC : Factor w/ 4 levels "Ex","Fa","Gd",..: NA NA NA NA NA NA NA NA NA NA ...
## $ Fence : Factor w/ 4 levels "GdPrv","GdWo",..: NA NA NA NA NA NA NA NA NA NA ...
## $ Misc.Feature : Factor w/ 5 levels "Elev","Gar2",..: NA NA NA NA NA NA NA NA NA NA ...
## $ Misc.Val : int [1:2580] 0 0 0 0 0 0 0 0 0 0 ...
## $ Mo.Sold : int [1:2580] 3 2 11 5 11 7 2 3 4 5 ...
## $ Yr.Sold : int [1:2580] 2010 2009 2007 2009 2009 2009 2009 2008 2008 2007 ...
## $ Sale.Type : Factor w/ 10 levels "COD","Con","ConLD",..: 10 10 10 10 10 3 10 7 10 10 ...
## $ Sale.Condition : Factor w/ 6 levels "Abnorml","AdjLand",..: 5 5 5 5 5 5 5 6 5 5 ...
# Show the number of numeric and factor variables
# vapply() is type-stable, unlike unlist(lapply(...)) / sapply()
nums <- vapply(ames_all, is.numeric, logical(1))
fcts <- vapply(ames_all, is.factor, logical(1))
cat(sum(nums), '\t', sum(fcts))
## 38 43
There are 2,580 rows with 81 columns in the data set (including ames_train, ames_test and ames_validation), of which 38 are numerical and 43 are categorical.
Response Variable
We first take a look into our response variable price:
# Function to plot Normal QQ
# Draws a normal Q-Q plot of `vec` with a reference line through the
# first and third quartiles (the same line qqline() would draw).
#
# @param vec numeric vector; NAs are ignored via na.rm
# @return a ggplot object
qqplot.data <- function(vec) {
  # Quartile-based slope and intercept for the reference line
  y <- quantile(vec, c(0.25, 0.75), na.rm = TRUE)
  x <- qnorm(c(0.25, 0.75))
  slope <- diff(y) / diff(x)
  int <- y[1L] - slope * x[1L]
  d <- data.frame(resids = vec)
  ggplot(d, aes(sample = resids)) +
    stat_qq() +
    geom_abline(slope = slope, intercept = int) +
    xlab("Theoretical Quantiles") +
    ylab("Sample Quantiles")
}
# Histogram + kernel density for price and log(price), plus normal Q-Q
# plots for both, arranged in a 2x2 grid.
# NOTE(review): `..density..` is deprecated in ggplot2 >= 3.4 in favor of
# after_stat(density); kept as-is for compatibility — confirm ggplot2 version.
p1 <- ggplot(ames_all, aes(price)) + geom_histogram(aes(y = ..density..), bins = 30, alpha = 0.75, color='blue', fill='blue') + geom_density()
p2 <- ggplot(ames_all, aes(log(price))) + geom_histogram(aes(y = ..density..), bins = 30, alpha = 0.75, color='blue', fill='blue') + geom_density()
n1 <- qqplot.data(ames_all$price)
n2 <- qqplot.data(log(ames_all$price))
grid.arrange(
p1, p2, n1, n2,
nrow = 2,
bottom = textGrob(
"",
gp = gpar(fontface = 3, fontsize = 9),
hjust = 1,
x = 1
)
)
# Five-number summaries: price is right-skewed, log(price) is near-symmetric
summary(ames_all$price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12789 129975 159900 178060 209625 755000
summary(log(ames_all$price))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 9.456 11.775 11.982 12.013 12.253 13.534
library(moments)
# Skewness near 0 after the log transform supports modeling log(price)
cat('Skewness and kurtosis of price:\t\t', skewness(ames_all$price), '\t', kurtosis(ames_all$price), '\n')
## Skewness and kurtosis of price: 1.759778 8.419952
cat('Skewness and kurtosis of ln(price):\t', skewness(log(ames_all$price)), '\t', kurtosis(log(ames_all$price)), '\n')
## Skewness and kurtosis of ln(price): 0.04124497 4.393831
The distribution of price:
Completeness
Next, we evaluate the completeness of the data set and impute any missing values.
# Show all columns with NA values (as a fraction of rows, descending)
NAcolumn <- which(colSums(is.na(ames_all)) > 0)
# is.na() works directly on a data frame; no need to re-scan with sapply()
sort(colSums(is.na(ames_all[NAcolumn])), decreasing = TRUE) / nrow(ames_all)
## Pool.QC Misc.Feature Alley Fence Fireplace.Qu
## 0.9965116279 0.9624031008 0.9348837209 0.7965116279 0.4810077519
## Lot.Frontage Garage.Yr.Blt Garage.Qual Garage.Cond Garage.Type
## 0.1790697674 0.0500000000 0.0496124031 0.0496124031 0.0492248062
## Garage.Finish Bsmt.Qual Bsmt.Cond Bsmt.Exposure BsmtFin.Type.1
## 0.0492248062 0.0263565891 0.0263565891 0.0263565891 0.0263565891
## BsmtFin.Type.2 Mas.Vnr.Area Bsmt.Full.Bath Bsmt.Half.Bath BsmtFin.SF.1
## 0.0263565891 0.0054263566 0.0007751938 0.0007751938 0.0003875969
## BsmtFin.SF.2 Bsmt.Unf.SF Total.Bsmt.SF Garage.Cars Garage.Area
## 0.0003875969 0.0003875969 0.0003875969 0.0003875969 0.0003875969
Handling NA
library(forcats)
# For missing Garage.Yr.Blt, fall back to the house's Year.Built
na_gyb <- is.na(ames_all$Garage.Yr.Blt)
ames_all$Garage.Yr.Blt[na_gyb] <- ames_all$Year.Built[na_gyb]
# For all other missing numerical variables, impute 0
# (across(where(...)) supersedes the older mutate_if())
ames_all <- ames_all %>%
  mutate(across(where(is.numeric), ~replace(.x, is.na(.x), 0)))
# For missing categorical variables, add an explicit 'None' level
ames_all <- ames_all %>%
  mutate(across(where(is.factor), ~fct_explicit_na(.x, na_level = 'None')))
# Drop Utilities (single informative level) and Sale.Condition
ames_all <- ames_all %>% dplyr::select(-Utilities, -Sale.Condition)
# Factorize MS.SubClass: it is a discrete dwelling code, not a quantity
ames_all$MS.SubClass <- as.factor(ames_all$MS.SubClass)
Detailed EDA
We use random forest with 100 trees to quickly evaluate the important features.
# Run a quick random forest with 100 trees to find important features
library(randomForest)
set.seed(1)
# Fit on the training rows only (first nrow(ames_train) rows of ames_all)
model_rf <- randomForest(log(price) ~ . , data=ames_all[1:nrow(ames_train),], ntree=100, importance=TRUE)
imp_RF <- importance(model_rf)
# Column 1 is %IncMSE: mean increase in MSE when the variable is permuted
imp_DF <- data.frame(Variables = row.names(imp_RF), MSE = imp_RF[,1])
imp_DF <- imp_DF[order(imp_DF$MSE, decreasing = TRUE),]
# Keep the names of the 30 most important predictors
imp_var <- c(imp_DF[1:30,'Variables'])
ggplot(imp_DF[1:30,], aes(x=reorder(Variables, MSE), y=MSE, fill=MSE)) + geom_bar(stat = 'identity') + labs(x = 'Variables', y= '% increase in MSE if variable values are randomely shuffled (permuted)') + coord_flip() + theme(legend.position="none")
We proceed with our analysis using the top 30 important features that model_rf suggests.
# Keep price plus the 30 most important predictors
ames_all <- ames_all %>% dplyr::select(price, all_of(imp_var))
# Split the training rows into numeric vs. factor columns for the
# correlation / chi-square analyses below.
# select(where(...)) supersedes select_if()
num_variables <- dplyr::select(ames_all[1:nrow(ames_train),], where(is.numeric))
cat_variables <- dplyr::select(ames_all[1:nrow(ames_train),], where(is.factor))
We analyze the correlation between these fields with the response variable and within themselves. We look at the numerical variables first:
library(corrplot)
## corrplot 0.84 loaded
# Compute correlation between numerical values
correlationMatrix <- cor(num_variables)
# Sort on decreasing correlations with price
cor_sorted <- as.matrix(sort(correlationMatrix[, 'price'], decreasing = TRUE))
# Select only high correlations
# (variables whose absolute correlation with price exceeds 0.5)
CorHigh <- names(which(apply(cor_sorted, 1, function(x) abs(x)>0.5)))
correlationMatrix <- correlationMatrix[CorHigh, CorHigh]
# Mixed display: numbers in the lower triangle, circles in the upper
corrplot.mixed(correlationMatrix, tl.col="black", tl.pos="lt")
The color-coded correlation matrix (above) shows numerical attributes that have absolute correlation strength greater than 0.5 to price. Overall.Qual shows strong correlation (>0.75) with price. Other features like area, Total.Bsmt.SF and Full.Bath show medium to strong correlation (0.5-0.75).
We exclude X1st.Flr.SF, Garage.Area, Garage.Yr.Blt and TotRms.AbvGrd due to multicollinearity (correlation strength larger than 0.75 with another independent variable).
# Keep the high-correlation numeric columns plus all categorical columns,
# then drop the four variables flagged for multicollinearity.
multicollinear <- c("X1st.Flr.SF", "Garage.Area", "Garage.Yr.Blt", "TotRms.AbvGrd")
ames_all <- ames_all %>%
  dplyr::select(all_of(CorHigh), all_of(colnames(cat_variables))) %>%
  dplyr::select(-all_of(multicollinear))
We note that the following combinations of variables exhibit absolute correlation strength (0.5-0.75): Overall.Qual & area, Overall.Qual & Total.Bsmt.SF, Overall.Qual & Garage.Cars, Overall.Qual & Year.Built, Overall.Qual & Year.Remod.Add, Overall.Qual & Full.Bath, area & Garage.Cars, area & Full.Bath, Garage.Cars & Year.Built, Year.Built & Year.Remod.Add.
# Numerical variables
names(dplyr::select_if(ames_all[1:nrow(ames_train),], is.numeric))
## [1] "price" "Overall.Qual" "area" "Total.Bsmt.SF"
## [5] "Garage.Cars" "Year.Built" "Year.Remod.Add" "Full.Bath"
## [9] "Fireplaces"
# Categorical variables
names(dplyr::select_if(ames_all[1:nrow(ames_train),], is.factor))
## [1] "Neighborhood" "MS.SubClass" "BsmtFin.Type.1" "Bsmt.Exposure"
## [5] "Kitchen.Qual" "Garage.Type" "Bsmt.Qual" "MS.Zoning"
## [9] "Fireplace.Qu" "Exterior.1st" "House.Style" "Central.Air"
# Scatter plot of one numeric predictor vs log(price), with a loess (red)
# and a linear (blue) smoother overlaid. Deduplicates eight copy-pasted
# plot definitions.
scatter_vs_logprice <- function(var) {
  ggplot(ames_all, aes(x = .data[[var]], y = log(price))) +
    geom_point(alpha = 0.75) +
    geom_smooth(method = loess, fill = "red", color = "red") +
    geom_smooth(method = lm, fill = "blue", color = "blue")
}
num_cols <- c("Overall.Qual", "area", "Total.Bsmt.SF", "Garage.Cars",
              "Year.Built", "Year.Remod.Add", "Full.Bath", "Fireplaces")
scatter_plots <- lapply(num_cols, scatter_vs_logprice)
# Create a grid of plots
grid.arrange(
  grobs = scatter_plots,
  nrow = 3,
  top = "log(price)",
  bottom = textGrob(
    "",
    gp = gpar(fontface = 3, fontsize = 9),
    hjust = 1,
    x = 1
  )
)
Next, we look into the categorical features. We analyze the distribution of each variables using box plots.
# Box plots and count bar charts of each categorical variable, with levels
# ordered by median log(price).
# BUG FIX: the original passed `fun = median` to reorder(); argument
# matching is case-sensitive, so `fun` fell into `...` and the default
# `FUN = mean` was silently used. `FUN = median` applies the intended order.
box_by_factor <- function(var) {
  ggplot(ames_all,
         aes(x = reorder(.data[[var]], log(price), FUN = median),
             y = log(price))) +
    geom_boxplot(alpha = 0.75, color = 'blue') +
    xlab(var)
}
# Companion count bar chart with the same level ordering
bar_by_factor <- function(var) {
  ggplot(ames_all,
         aes(x = reorder(.data[[var]], log(price), FUN = median))) +
    geom_bar(alpha = 0.75, color = 'blue', fill = 'blue') +
    xlab(var) +
    geom_label(stat = "count", aes(label = ..count.., y = ..count..), size = 3)
}
# Hide unreadable x-axis labels for the high-cardinality factors
blank_x <- theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
factor_cols <- c("Neighborhood", "MS.SubClass", "BsmtFin.Type.1", "Garage.Type",
                 "Bsmt.Qual", "Kitchen.Qual", "Central.Air", "Bsmt.Exposure",
                 "Fireplace.Qu", "MS.Zoning", "Exterior.1st", "House.Style")
pair_plots <- unlist(lapply(factor_cols, function(v) {
  b <- box_by_factor(v)
  h <- bar_by_factor(v)
  if (v %in% c("Neighborhood", "Exterior.1st")) {
    b <- b + blank_x
    h <- h + blank_x
  }
  list(b, h)
}), recursive = FALSE)
# Create a grid of plots (box plot and bar chart side by side per variable)
grid.arrange(
  grobs = pair_plots,
  nrow = 12,
  bottom = textGrob(
    "",
    gp = gpar(fontface = 3, fontsize = 9),
    hjust = 1,
    x = 1
  )
)
Distributions of the important categorical variables, such as Neighborhood, MS.SubClass and Fireplace.Qu, all exhibit clear group dependencies.
We run pairwise chi-square test for all combinations of categorical variables in ames_all to evaluate the strength of correlation.
# Run chi-square test for all combinations of categorical variables
# Run chi-square test for all combinations of categorical variables
# NOTE(review): loading plyr after dplyr masks dplyr verbs (summarise,
# mutate, arrange...); verify later dplyr calls still resolve as intended.
library(plyr)
cat_df <- droplevels(cat_variables)
# All unordered pairs of column indices (2 x choose(k, 2) matrix)
combos <- combn(ncol(cat_df), 2)
# adply iterates over the columns of `combos`; each pair yields one row
cat_corelations <- adply(combos, 2, function(x) {
column_one <- names(cat_df[, x[1]])
column_two <- names(cat_df[, x[2]])
mydata <- data.frame(cat_df[, x[1]], cat_df[, x[2]])
mytab <- table(mydata)
# Pearson's chi-square test of independence on the contingency table
test <- chisq.test(mytab)
out <- data.frame('Column.A' = column_one,
'Column.B' = column_two,
'Chi.Square' = test$statistic,
'p.value' = test$p.value)
return(out)
})
cat_corelations %>% arrange(p.value)
## X1 Column.A Column.B Chi.Square p.value
## 1 20 MS.SubClass House.Style 4904.58739 0.000000e+00
## 2 25 BsmtFin.Type.1 Bsmt.Qual 2419.49701 0.000000e+00
## 3 33 Bsmt.Exposure Bsmt.Qual 1643.62391 0.000000e+00
## 4 22 BsmtFin.Type.1 Bsmt.Exposure 1632.67623 3.898178e-321
## 5 7 Neighborhood MS.Zoning 1675.62512 1.429146e-266
## 6 1 Neighborhood MS.SubClass 2113.30616 8.995274e-244
## 7 9 Neighborhood Exterior.1st 1551.70372 1.141377e-172
## 8 6 Neighborhood Bsmt.Qual 1074.37787 6.709832e-137
## 9 40 Kitchen.Qual Bsmt.Qual 606.46571 1.052008e-112
## 10 15 MS.SubClass Garage.Type 758.11043 4.269185e-109
## 11 4 Neighborhood Kitchen.Qual 740.79723 1.031665e-96
## 12 17 MS.SubClass MS.Zoning 557.65857 4.300817e-77
## 13 10 Neighborhood House.Style 741.90883 4.709411e-77
## 14 19 MS.SubClass Exterior.1st 695.53148 8.635330e-70
## 15 5 Neighborhood Garage.Type 692.96301 1.064479e-68
## 16 8 Neighborhood Fireplace.Qu 631.69723 6.163565e-67
## 17 2 Neighborhood BsmtFin.Type.1 723.45552 1.287607e-65
## 18 16 MS.SubClass Bsmt.Qual 485.74749 7.539032e-58
## 19 54 Bsmt.Qual Exterior.1st 414.18138 6.777544e-52
## 20 50 Garage.Type House.Style 311.76210 1.196497e-45
## 21 23 BsmtFin.Type.1 Kitchen.Qual 277.95727 5.606503e-43
## 22 12 MS.SubClass BsmtFin.Type.1 427.76070 9.481323e-43
## 23 42 Kitchen.Qual Fireplace.Qu 241.11458 7.030731e-40
## 24 52 Bsmt.Qual MS.Zoning 266.29166 1.055619e-39
## 25 46 Garage.Type Bsmt.Qual 275.67325 1.030660e-38
## 26 43 Kitchen.Qual Exterior.1st 290.31705 5.201018e-38
## 27 53 Bsmt.Qual Fireplace.Qu 245.68967 1.027544e-35
## 28 59 MS.Zoning House.Style 244.80707 1.519745e-35
## 29 51 Garage.Type Central.Air 171.84349 1.827871e-34
## 30 39 Kitchen.Qual Garage.Type 218.20485 3.006844e-33
## 31 13 MS.SubClass Bsmt.Exposure 309.77790 6.757078e-32
## 32 18 MS.SubClass Fireplace.Qu 305.76809 3.233467e-31
## 33 47 Garage.Type MS.Zoning 215.84774 5.154242e-30
## 34 21 MS.SubClass Central.Air 168.75901 1.219432e-28
## 35 14 MS.SubClass Kitchen.Qual 262.82890 1.558278e-28
## 36 37 Bsmt.Exposure House.Style 203.87167 9.319490e-28
## 37 28 BsmtFin.Type.1 Exterior.1st 296.06951 1.978451e-27
## 38 3 Neighborhood Bsmt.Exposure 380.94294 1.824458e-26
## 39 48 Garage.Type Fireplace.Qu 194.19349 6.006762e-26
## 40 49 Garage.Type Exterior.1st 250.29033 2.977990e-23
## 41 61 Fireplace.Qu Exterior.1st 225.93296 1.373295e-22
## 42 60 MS.Zoning Central.Air 107.36123 1.479014e-21
## 43 62 Fireplace.Qu House.Style 170.23248 1.552867e-21
## 44 29 BsmtFin.Type.1 House.Style 190.65112 7.933891e-21
## 45 55 Bsmt.Qual House.Style 178.01730 1.056784e-20
## 46 58 MS.Zoning Exterior.1st 214.03252 1.277635e-20
## 47 56 Bsmt.Qual Central.Air 104.95682 2.314172e-20
## 48 45 Kitchen.Qual Central.Air 97.93501 2.706262e-20
## 49 11 Neighborhood Central.Air 144.21609 2.382353e-18
## 50 64 Exterior.1st House.Style 217.17778 5.184934e-18
## 51 24 BsmtFin.Type.1 Garage.Type 165.47658 1.423255e-16
## 52 32 Bsmt.Exposure Garage.Type 138.58803 6.783282e-16
## 53 65 Exterior.1st Central.Air 91.17590 9.804165e-15
## 54 30 BsmtFin.Type.1 Central.Air 78.25601 3.122311e-14
## 55 44 Kitchen.Qual House.Style 115.30847 6.590662e-14
## 56 41 Kitchen.Qual MS.Zoning 105.03690 1.564068e-13
## 57 26 BsmtFin.Type.1 MS.Zoning 132.96944 2.451758e-13
## 58 27 BsmtFin.Type.1 Fireplace.Qu 128.25194 1.445633e-12
## 59 31 Bsmt.Exposure Kitchen.Qual 98.49227 2.342261e-12
## 60 38 Bsmt.Exposure Central.Air 52.50011 4.259689e-10
## 61 57 MS.Zoning Fireplace.Qu 80.86189 8.355346e-08
## 62 63 Fireplace.Qu Central.Air 32.51091 4.705818e-06
## 63 35 Bsmt.Exposure Fireplace.Qu 62.81626 4.231599e-05
## 64 34 Bsmt.Exposure MS.Zoning 58.57374 1.639107e-04
## 65 66 House.Style Central.Air 21.16085 1.716426e-03
## 66 36 Bsmt.Exposure Exterior.1st 84.65377 6.253605e-03
The output table shows that there are statistically significant dependencies between all categorical variables (p.value ~ 0).
area
We take the log transformation of the variable area as the distribution becomes more symmetric (histogram below), the relationship to log(price) becomes more linear (scatter plot below) and the R-squared for a simple linear regression on log(price) has a higher score after the transformation.
# Compare the distributions of area and log(area)
h1 <- ggplot(data=ames_all, aes(area)) + geom_histogram(aes(y = ..density..), bins = 30, alpha = 0.75, color='blue', fill='blue') + geom_density()
h2 <- ggplot(data=ames_all, aes(log(area))) + geom_histogram(aes(y = ..density..), bins = 30, alpha = 0.75, color='blue', fill='blue') + geom_density()
s1 <- ggplot(data=ames_all, aes(x=area, y=log(price))) + geom_point(alpha=0.75)+geom_smooth(method=lm, fill="blue", color="blue")
s2 <- ggplot(data=ames_all, aes(x=log(area), y=log(price))) + geom_point(alpha=0.75)+geom_smooth(method=lm, fill="blue", color="blue")
grid.arrange(
h1, h2, s1, s2,
nrow = 2,
bottom = textGrob(
"",
gp = gpar(fontface = 3, fontsize = 9),
hjust = 1,
x = 1
)
)
# Compare skewness of area and log(area)
cat('Skewness of area:\t', skewness(ames_all$area), '\n')
## Skewness of area: 0.9794948
cat('Skewness of ln(area):\t', skewness(log(ames_all$area)), '\n')
## Skewness of ln(area): -0.05829055
# Compare R squared of simple linear regression using area and log(area)
# BUG FIX: the fits were previously named `nor` and `log`; the latter
# masked base::log(), so both are renamed.
fit_raw <- lm(log(price) ~ area, data=ames_all[1:nrow(ames_train),])
fit_log <- lm(log(price) ~ log(area), data=ames_all[1:nrow(ames_train),])
cat('R squared for area:\t', summary(fit_raw)$r.squared, '\n')
## R squared for area: 0.5109562
cat('R squared for ln(area):\t', summary(fit_log)$r.squared, '\n')
## R squared for ln(area): 0.5465901
par(mfrow = c(2, 2))
# Residual analysis for area
plot(fit_raw, which=c(2,5))
# Residual analysis for log(area)
plot(fit_log, which=c(2,5))
# Replace area with its log transform
ames_all <- ames_all %>% mutate(Log.area = log(area)) %>% dplyr::select(-area)
Total.Bsmt.SF
We will not take the log of Total.Bsmt.SF based on the following analysis.
# Compare the distributions of Total.Bsmt.SF and log(Total.Bsmt.SF)
# (+1 inside log() avoids -Inf for the zero-basement houses)
h1 <- ggplot(data=ames_all, aes(Total.Bsmt.SF)) + geom_histogram(aes(y = ..density..), bins = 30, alpha = 0.75, color='blue', fill='blue') + geom_density()
h2 <- ggplot(data=ames_all, aes(log(Total.Bsmt.SF+1))) + geom_histogram(aes(y = ..density..), bins = 30, alpha = 0.75, color='blue', fill='blue') + geom_density()
s1 <- ggplot(data=ames_all, aes(x=Total.Bsmt.SF, y=log(price))) + geom_point(alpha=0.75)+geom_smooth(method=lm, fill="blue", color="blue")
s2 <- ggplot(data=ames_all, aes(x=log(Total.Bsmt.SF+1), y=log(price))) + geom_point(alpha=0.75)+geom_smooth(method=lm, fill="blue", color="blue")
grid.arrange(
h1, h2, s1, s2,
nrow = 2,
bottom = textGrob(
"",
gp = gpar(fontface = 3, fontsize = 9),
hjust = 1,
x = 1
)
)
# Compare skewness of Total.Bsmt.SF and log(Total.Bsmt.SF)
cat('Skewness of BsmtSF:\t', skewness(ames_all$Total.Bsmt.SF), '\n')
## Skewness of BsmtSF: 0.5101172
# log(0) = -Inf for zero-basement houses, hence the NaN skewness below
cat('Skewness of ln(BsmtSF):\t', skewness(log(ames_all$Total.Bsmt.SF)), '\n')
## Skewness of ln(BsmtSF): NaN
# Compare R squared of simple linear regression using Total.Bsmt.SF and log(Total.Bsmt.SF)
# BUG FIX: renamed from `nor`/`log` — `log` masked base::log()
fit_raw <- lm(log(price) ~ Total.Bsmt.SF, data=ames_all[1:nrow(ames_train),])
fit_log <- lm(log(price) ~ log(Total.Bsmt.SF+1), data=ames_all[1:nrow(ames_train),])
cat('R squared for Total.Bsmt.SF:\t', summary(fit_raw)$r.squared, '\n')
## R squared for Total.Bsmt.SF: 0.4476925
cat('R squared for ln(Total.Bsmt.SF):\t', summary(fit_log)$r.squared, '\n')
## R squared for ln(Total.Bsmt.SF): 0.1520897
par(mfrow = c(2, 2))
# Residual analysis for Total.Bsmt.SF
plot(fit_raw, which=c(2,5))
# Residual analysis for log(Total.Bsmt.SF)
plot(fit_log, which=c(2,5))
Neighborhood
We reduce the cardinality of the column Neighborhood to lower the computational cost.
# Box plot of log(price) by Neighborhood, levels ordered by median.
# BUG FIX: reorder()'s argument is `FUN` (matching is case-sensitive), so
# the original `fun = median` was silently ignored and the default mean
# was used. Also removed the constant alpha wrongly mapped inside aes().
b1 <- ggplot(ames_all, aes(x = reorder(Neighborhood, log(price), FUN = median), y = log(price))) + geom_boxplot(aes(fill=Neighborhood), alpha=0.5) + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) + xlab("Neighborhood")
# Bin the 28 neighborhoods into four price tiers (3 = highest-priced group)
ames_all$Neighborhood.Binned[ames_all$Neighborhood %in% c('NoRidge', 'NridgHt', 'StoneBr', 'GrnHill', 'Veenker', 'Somerst', 'Timber')] <- 3
ames_all$Neighborhood.Binned[ames_all$Neighborhood %in% c('ClearCr', 'CollgCr', 'Crawfor', 'Greens', 'Blmngtn', 'NWAmes', 'SawyerW')] <- 2
ames_all$Neighborhood.Binned[ames_all$Neighborhood %in% c('Gilbert', 'Mitchel', 'NPkVill', 'NAmes', 'Landmrk', 'SWISU', 'Sawyer')] <- 1
ames_all$Neighborhood.Binned[ames_all$Neighborhood %in% c('Blueste', 'BrkSide', 'Edwards', 'OldTown', 'IDOTRR', 'BrDale', 'MeadowV')] <- 0
ames_all$Neighborhood.Binned <- as.factor(ames_all$Neighborhood.Binned)
b2 <- ggplot(ames_all, aes(x = reorder(Neighborhood.Binned, log(price), FUN = median), y = log(price))) + geom_boxplot(aes(fill=Neighborhood.Binned), alpha=0.5) + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) + xlab("Neighborhood.Binned")
grid.arrange(
b1, b2,
nrow = 2,
bottom = textGrob(
"",
gp = gpar(fontface = 3, fontsize = 9),
hjust = 1,
x = 1
)
)
# Raw Neighborhood is no longer needed once the binned version exists
ames_all <- ames_all %>% dplyr::select(-Neighborhood)
MS.SubClass
Reduce the cardinality of the column MS.SubClass.
# Box plot of log(price) by MS.SubClass, levels ordered by median.
# BUG FIX: `fun = median` was silently ignored by reorder() (its argument
# is `FUN`, and matching is case-sensitive); `FUN = median` applies the
# intended ordering. Constant alpha removed from aes().
b1 <- ggplot(ames_all, aes(x = reorder(MS.SubClass, log(price), FUN = median), y = log(price))) + geom_boxplot(aes(fill=MS.SubClass), alpha=0.5) + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) + xlab("MS.SubClass")
# Bin the dwelling-type codes into four price tiers (3 = highest-priced)
ames_all$MS.SubClass.Binned[ames_all$MS.SubClass %in% c('60', '120', '80', '75')] <- 3
ames_all$MS.SubClass.Binned[ames_all$MS.SubClass %in% c('20', '85', '150', '70')] <- 2
ames_all$MS.SubClass.Binned[ames_all$MS.SubClass %in% c('90', '160', '40', '50')] <- 1
ames_all$MS.SubClass.Binned[ames_all$MS.SubClass %in% c('190', '45', '30', '180')] <- 0
ames_all$MS.SubClass.Binned <- as.factor(ames_all$MS.SubClass.Binned)
b2 <- ggplot(ames_all, aes(x = reorder(MS.SubClass.Binned, log(price), FUN = median), y = log(price))) + geom_boxplot(aes(fill=MS.SubClass.Binned), alpha=0.5) + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) + xlab("MS.SubClass.Binned")
grid.arrange(
b1, b2,
nrow = 2,
bottom = textGrob(
"",
gp = gpar(fontface = 3, fontsize = 9),
hjust = 1,
x = 1
)
)
# Raw MS.SubClass is no longer needed once the binned version exists
ames_all <- ames_all %>% dplyr::select(-MS.SubClass)
Unimportant Variables
ames_train %>% dplyr::select(-price, -all_of(imp_var)) %>% colnames()
## [1] "Lot.Frontage" "Street" "Alley" "Lot.Shape"
## [5] "Land.Contour" "Utilities" "Lot.Config" "Land.Slope"
## [9] "Condition.1" "Condition.2" "Bldg.Type" "Overall.Cond"
## [13] "Roof.Style" "Roof.Matl" "Exterior.2nd" "Mas.Vnr.Type"
## [17] "Mas.Vnr.Area" "Exter.Qual" "Exter.Cond" "Foundation"
## [21] "Bsmt.Cond" "BsmtFin.Type.2" "BsmtFin.SF.2" "Heating"
## [25] "Heating.QC" "Electrical" "Low.Qual.Fin.SF" "Bsmt.Half.Bath"
## [29] "Half.Bath" "Bedroom.AbvGr" "Kitchen.AbvGr" "Functional"
## [33] "Garage.Finish" "Garage.Qual" "Garage.Cond" "Paved.Drive"
## [37] "Wood.Deck.SF" "Open.Porch.SF" "Enclosed.Porch" "X3Ssn.Porch"
## [41] "Screen.Porch" "Pool.Area" "Pool.QC" "Fence"
## [45] "Misc.Feature" "Misc.Val" "Mo.Sold" "Yr.Sold"
## [49] "Sale.Type" "Sale.Condition"
The following interaction plots suggest that there could potentially be an interaction between (Log.area and Neighborhood.Binned), (Log.area and MS.SubClass.Binned), (Overall.Qual and Neighborhood.Binned) and (Overall.Qual and MS.SubClass.Binned).
i1 <- ggplot(aes(y=log(price), x=Log.area, color=Neighborhood.Binned, shape=Neighborhood.Binned), data=ames_all) + geom_point(alpha=0.75) + geom_smooth(method=lm, se=FALSE, fullrange=TRUE)
i2 <- ggplot(aes(y=log(price), x=Log.area, color=MS.SubClass.Binned, shape=MS.SubClass.Binned), data=ames_all) + geom_point(alpha=0.75) + geom_smooth(method=lm, se=FALSE, fullrange=TRUE)
i3 <- ggplot(aes(y=log(price), x=Overall.Qual, color=Neighborhood.Binned, shape=Neighborhood.Binned), data=ames_all) + geom_point(alpha=0.75) + geom_smooth(method=lm, se=FALSE, fullrange=TRUE)
i4 <- ggplot(aes(y=log(price), x=Overall.Qual, color=MS.SubClass.Binned, shape=MS.SubClass.Binned), data=ames_all) + geom_point(alpha=0.75) + geom_smooth(method=lm, se=FALSE, fullrange=TRUE)
grid.arrange(
i1, i2, i3, i4,
nrow = 2,
bottom = textGrob(
"",
gp = gpar(fontface = 3, fontsize = 9),
hjust = 1,
x = 1
)
)
We run simple linear regressions to evaluate whether the interaction terms are indeed significant. Although the adjusted R squared increases slightly once the interaction terms are included (for all combinations), the statistical significance of the added interaction terms is still considerably weaker than that of the original first-order terms, so we decide against including them in our linear model.
# Additive baseline: log(price) on Log.area and the binned neighborhood,
# fitted on the training rows only (first nrow(ames_train) rows of ames_all).
wo.Log.area.Neighborhood.Binned <- lm(log(price) ~ Log.area + Neighborhood.Binned, data=ames_all[1:nrow(ames_train),])
# Same model with the interaction: `*` expands to both main effects plus the
# Log.area:Neighborhood.Binned terms.
w.Log.area.Neighborhood.Binned <- lm(log(price) ~ Log.area*Neighborhood.Binned, data=ames_all[1:nrow(ames_train),])
summary(wo.Log.area.Neighborhood.Binned)
##
## Call:
## lm(formula = log(price) ~ Log.area + Neighborhood.Binned, data = ames_all[1:nrow(ames_train),
## ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.94180 -0.10576 0.01067 0.12203 0.85480
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.04516 0.16378 43.02 <2e-16 ***
## Log.area 0.64740 0.02307 28.06 <2e-16 ***
## Neighborhood.Binned1 0.24186 0.01852 13.06 <2e-16 ***
## Neighborhood.Binned2 0.34741 0.02097 16.57 <2e-16 ***
## Neighborhood.Binned3 0.60575 0.02266 26.73 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.216 on 995 degrees of freedom
## Multiple R-squared: 0.7374, Adjusted R-squared: 0.7363
## F-statistic: 698.4 on 4 and 995 DF, p-value: < 2.2e-16
summary(w.Log.area.Neighborhood.Binned)
##
## Call:
## lm(formula = log(price) ~ Log.area * Neighborhood.Binned, data = ames_all[1:nrow(ames_train),
## ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.94160 -0.10314 0.00779 0.11598 0.85466
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.04111 0.27104 25.978 < 2e-16 ***
## Log.area 0.64797 0.03827 16.930 < 2e-16 ***
## Neighborhood.Binned1 1.18786 0.41589 2.856 0.00438 **
## Neighborhood.Binned2 0.50743 0.44620 1.137 0.25572
## Neighborhood.Binned3 -1.28155 0.50919 -2.517 0.01200 *
## Log.area:Neighborhood.Binned1 -0.13265 0.05847 -2.269 0.02349 *
## Log.area:Neighborhood.Binned2 -0.02182 0.06158 -0.354 0.72317
## Log.area:Neighborhood.Binned3 0.25188 0.06908 3.646 0.00028 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2133 on 992 degrees of freedom
## Multiple R-squared: 0.7447, Adjusted R-squared: 0.7429
## F-statistic: 413.3 on 7 and 992 DF, p-value: < 2.2e-16
# BUG FIX: these models are named after Overall.Qual but previously refit
# Log.area (copy-paste error) — the printed summaries below are byte-identical
# to the Log.area models above, confirming the slip. Fit the intended
# predictor here; the cached output below is stale and should be re-knitted.
wo.Overall.Qual.Neighborhood.Binned <- lm(log(price) ~ Overall.Qual + Neighborhood.Binned, data=ames_all[1:nrow(ames_train),])
w.Overall.Qual.Neighborhood.Binned <- lm(log(price) ~ Overall.Qual*Neighborhood.Binned, data=ames_all[1:nrow(ames_train),])
summary(wo.Overall.Qual.Neighborhood.Binned)
##
## Call:
## lm(formula = log(price) ~ Log.area + Neighborhood.Binned, data = ames_all[1:nrow(ames_train),
## ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.94180 -0.10576 0.01067 0.12203 0.85480
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.04516 0.16378 43.02 <2e-16 ***
## Log.area 0.64740 0.02307 28.06 <2e-16 ***
## Neighborhood.Binned1 0.24186 0.01852 13.06 <2e-16 ***
## Neighborhood.Binned2 0.34741 0.02097 16.57 <2e-16 ***
## Neighborhood.Binned3 0.60575 0.02266 26.73 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.216 on 995 degrees of freedom
## Multiple R-squared: 0.7374, Adjusted R-squared: 0.7363
## F-statistic: 698.4 on 4 and 995 DF, p-value: < 2.2e-16
summary(w.Overall.Qual.Neighborhood.Binned)
##
## Call:
## lm(formula = log(price) ~ Log.area * Neighborhood.Binned, data = ames_all[1:nrow(ames_train),
## ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.94160 -0.10314 0.00779 0.11598 0.85466
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.04111 0.27104 25.978 < 2e-16 ***
## Log.area 0.64797 0.03827 16.930 < 2e-16 ***
## Neighborhood.Binned1 1.18786 0.41589 2.856 0.00438 **
## Neighborhood.Binned2 0.50743 0.44620 1.137 0.25572
## Neighborhood.Binned3 -1.28155 0.50919 -2.517 0.01200 *
## Log.area:Neighborhood.Binned1 -0.13265 0.05847 -2.269 0.02349 *
## Log.area:Neighborhood.Binned2 -0.02182 0.06158 -0.354 0.72317
## Log.area:Neighborhood.Binned3 0.25188 0.06908 3.646 0.00028 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2133 on 992 degrees of freedom
## Multiple R-squared: 0.7447, Adjusted R-squared: 0.7429
## F-statistic: 413.3 on 7 and 992 DF, p-value: < 2.2e-16
# Additive baseline: log(price) on Log.area and the binned MS.SubClass,
# fitted on the training rows only.
wo.Log.area.MS.SubClass.Binned <- lm(log(price) ~ Log.area + MS.SubClass.Binned, data=ames_all[1:nrow(ames_train),])
# Same model with the Log.area x MS.SubClass.Binned interaction added.
w.Log.area.MS.SubClass.Binned <- lm(log(price) ~ Log.area*MS.SubClass.Binned, data=ames_all[1:nrow(ames_train),])
summary(wo.Log.area.MS.SubClass.Binned)
##
## Call:
## lm(formula = log(price) ~ Log.area + MS.SubClass.Binned, data = ames_all[1:nrow(ames_train),
## ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.92814 -0.12669 0.00423 0.14758 0.75446
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.50352 0.18443 29.840 < 2e-16 ***
## Log.area 0.87464 0.02653 32.965 < 2e-16 ***
## MS.SubClass.Binned1 -0.07512 0.03475 -2.162 0.0309 *
## MS.SubClass.Binned2 0.27313 0.03082 8.862 < 2e-16 ***
## MS.SubClass.Binned3 0.24615 0.03429 7.178 1.38e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.248 on 995 degrees of freedom
## Multiple R-squared: 0.6536, Adjusted R-squared: 0.6522
## F-statistic: 469.3 on 4 and 995 DF, p-value: < 2.2e-16
summary(w.Log.area.MS.SubClass.Binned)
##
## Call:
## lm(formula = log(price) ~ Log.area * MS.SubClass.Binned, data = ames_all[1:nrow(ames_train),
## ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.97446 -0.12513 0.01265 0.14916 0.73702
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.62291 0.51556 14.786 < 2e-16 ***
## Log.area 0.56633 0.07490 7.561 9.08e-14 ***
## MS.SubClass.Binned1 -1.77796 0.71912 -2.472 0.0136 *
## MS.SubClass.Binned2 -3.12799 0.58387 -5.357 1.05e-07 ***
## MS.SubClass.Binned3 -0.87042 0.63298 -1.375 0.1694
## Log.area:MS.SubClass.Binned1 0.25095 0.10184 2.464 0.0139 *
## Log.area:MS.SubClass.Binned2 0.48732 0.08410 5.795 9.18e-09 ***
## Log.area:MS.SubClass.Binned3 0.17355 0.08968 1.935 0.0532 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2427 on 992 degrees of freedom
## Multiple R-squared: 0.6693, Adjusted R-squared: 0.6669
## F-statistic: 286.8 on 7 and 992 DF, p-value: < 2.2e-16
# BUG FIX: these models are named after Overall.Qual but previously refit
# Log.area (copy-paste error) — the printed summaries below duplicate the
# Log.area models above, confirming the slip. Fit the intended predictor;
# the cached output below is stale and should be re-knitted.
wo.Overall.Qual.MS.SubClass.Binned <- lm(log(price) ~ Overall.Qual + MS.SubClass.Binned, data=ames_all[1:nrow(ames_train),])
w.Overall.Qual.MS.SubClass.Binned <- lm(log(price) ~ Overall.Qual*MS.SubClass.Binned, data=ames_all[1:nrow(ames_train),])
summary(wo.Overall.Qual.MS.SubClass.Binned)
##
## Call:
## lm(formula = log(price) ~ Log.area + MS.SubClass.Binned, data = ames_all[1:nrow(ames_train),
## ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.92814 -0.12669 0.00423 0.14758 0.75446
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.50352 0.18443 29.840 < 2e-16 ***
## Log.area 0.87464 0.02653 32.965 < 2e-16 ***
## MS.SubClass.Binned1 -0.07512 0.03475 -2.162 0.0309 *
## MS.SubClass.Binned2 0.27313 0.03082 8.862 < 2e-16 ***
## MS.SubClass.Binned3 0.24615 0.03429 7.178 1.38e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.248 on 995 degrees of freedom
## Multiple R-squared: 0.6536, Adjusted R-squared: 0.6522
## F-statistic: 469.3 on 4 and 995 DF, p-value: < 2.2e-16
summary(w.Overall.Qual.MS.SubClass.Binned)
##
## Call:
## lm(formula = log(price) ~ Log.area * MS.SubClass.Binned, data = ames_all[1:nrow(ames_train),
## ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.97446 -0.12513 0.01265 0.14916 0.73702
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.62291 0.51556 14.786 < 2e-16 ***
## Log.area 0.56633 0.07490 7.561 9.08e-14 ***
## MS.SubClass.Binned1 -1.77796 0.71912 -2.472 0.0136 *
## MS.SubClass.Binned2 -3.12799 0.58387 -5.357 1.05e-07 ***
## MS.SubClass.Binned3 -0.87042 0.63298 -1.375 0.1694
## Log.area:MS.SubClass.Binned1 0.25095 0.10184 2.464 0.0139 *
## Log.area:MS.SubClass.Binned2 0.48732 0.08410 5.795 9.18e-09 ***
## Log.area:MS.SubClass.Binned3 0.17355 0.08968 1.935 0.0532 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2427 on 992 degrees of freedom
## Multiple R-squared: 0.6693, Adjusted R-squared: 0.6669
## F-statistic: 286.8 on 7 and 992 DF, p-value: < 2.2e-16
We normalize (scale and center) all numerical attributes.
library(caret)
## Loading required package: lattice
# Fit a center/scale transform on every column except the first, then apply it
# to the whole frame (preProcess only transforms numeric columns).
# NOTE(review): the transform is estimated on train+test+validation combined,
# which leaks test-set statistics into preprocessing — TODO confirm this is
# acceptable for the intended evaluation.
preProc <- preProcess(ames_all[,-1], method=c("center", "scale"))
ames_all <- predict(preProc, ames_all)
We fit a simple multiple linear regression model on the training dataset and evaluate the Cook’s distance of the observation points.
# Fit the full linear model on the training rows only and draw the standard
# lm diagnostic panels (residuals, Q-Q, scale-location, residuals-vs-leverage)
# used to screen for influential points via Cook's distance.
outlier <- lm(log(price) ~ . , data=ames_all[1:nrow(ames_train),])
par(mfrow = c(2, 2))
plot(outlier)
From the result, we conclude that none of the points included qualifies as a highly influential leverage point. Therefore, we keep all the observations.
We take one hot encoding of the factor variables.
# Drop levels with no entries
ames_all <- droplevels(ames_all)
# One-hot encoding using model.matrix()
# NOTE(review): model.matrix() silently drops rows containing NA, so row
# counts may no longer match the train/test/validation sizes — verify all NAs
# were handled upstream before this point.
ames_all <- as.data.frame(model.matrix( ~ . -1, ames_all))
# Standardize weird column names (make.names produces syntactically valid ones)
colnames(ames_all) <- make.names(colnames(ames_all))
# Drop perfectly collinear columns: alias() lists coefficients that are exact
# linear combinations of others; removing them makes the design full rank.
drop_collinear <- rownames(alias(lm(log(price) ~ . , data=ames_all))$Complete)
ames_all <- ames_all %>% dplyr::select(-all_of(drop_collinear))
We drop variables with high VIF (>10) as they make the estimates of coefficients unstable.
# Calculate Variance Inflation Factor of attributes
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:PreProcess':
##
## ellipse
## The following object is masked from 'package:dplyr':
##
## recode
# Variance-inflation factors of the full linear model; variables with
# GVIF > 10 are considered too collinear and dropped.
vif_df <- data.frame(car::vif(lm(log(price) ~ . , data=ames_all)))
names(vif_df)[1] <- "GVIF"
# BUG FIX: dplyr verbs (filter/arrange) drop data.frame row names, so taking
# rownames() of the piped result yields "1", "2", ... instead of variable
# names and the subsequent select() would misfire. Index the row names
# directly; ordering was irrelevant to the drop.
drop_high_vif <- rownames(vif_df)[vif_df$GVIF > 10]
ames_all <- ames_all %>% dplyr::select(-all_of(drop_high_vif))
In theory, using a reduced dimension data set makes no difference for the outcome of the prediction. PCA only removes the multicollinearity between the fields. We reduce the dimension of the data set by implementing PCA of the continuous numerical variables and later use this data set to see if it improves the performance.
# PCA of the main continuous predictors, keeping the first 3 components.
# (Previous comment was a stale copy-paste from the VIF section.)
num_var <- c("Log.area", "Overall.Qual", "Total.Bsmt.SF", "Garage.Cars", "Year.Built", "Year.Remod.Add", "Full.Bath", "Fireplaces")
dim_reduction <- ames_all %>% dplyr::select(all_of(num_var))
# Columns are already standardized above, but center/scale here is harmless
# and keeps the call self-contained; rank. = 3 retains three components.
pca <- prcomp(dim_reduction, center=TRUE, scale.=TRUE, rank. = 3)
summary(pca)
## Importance of first k=3 (out of 8) components:
## PC1 PC2 PC3
## Standard deviation 2.0028 1.0517 0.88204
## Proportion of Variance 0.5014 0.1383 0.09725
## Cumulative Proportion 0.5014 0.6397 0.73692
library(ggbiplot)
## Loading required package: scales
ggbiplot(pca)
## Warning in sweep(pcobj$x, 2, 1/(d * nobs.factor), FUN = "*"): STATS is longer
## than the extent of 'dim(x)[MARGIN]'
## Warning in sweep(v, 2, d^var.scale, FUN = "*"): STATS is longer than the extent
## of 'dim(x)[MARGIN]'
# Non-PCA columns of the combined frame.
rest <- ames_all %>% dplyr::select(-all_of(num_var))
# BUG FIX: merge(..., by = 0, sort = FALSE) does not guarantee the original
# row order (merge documents the resulting order as unspecified), yet the
# positional splits below depend on it. cbind() keeps rows aligned because
# pca$x rows are already in ames_all order, and yields the same columns.
ames_pca <- cbind(rest, as.data.frame(pca$x))
# Split the combined frame back into train, test and validation by position.
# seq_len() makes the index arithmetic explicit instead of relying on the
# precedence of `+` vs `:`.
n_train <- nrow(ames_train)
n_test <- nrow(ames_test)
n_valid <- nrow(ames_validation)
train <- ames_all[seq_len(n_train), ]
test <- ames_all[n_train + seq_len(n_test), ]
validation <- ames_all[n_train + n_test + seq_len(n_valid), ]
train_pca <- ames_pca[seq_len(n_train), ]
test_pca <- ames_pca[n_train + seq_len(n_test), ]
validation_pca <- ames_pca[n_train + n_test + seq_len(n_valid), ]
We run a multiple linear regression on the pre-processed data set including the interaction terms mentioned above. R is directly solving the normal equations to obtain the least square fit: \[\hat{\beta} = (X^{T}X)^{-1}X^{T}Y\]
# Run multiple linear regression on log(price) using the full model
linear_model <- lm(log(price) ~ . , data=train)
summary(linear_model)$adj.r.squared
## [1] 0.8758192
We run stepwise backward variables elimination using AIC and BIC to find the best model.
\[AIC = 2k - 2log(L)\] \[BIC = klog(n) - 2log(L)\]
# Model selection using AIC
start_time <- Sys.time()
linear_model.AIC <- stepAIC(linear_model, k=2, trace=0)
end_time <- Sys.time()
end_time - start_time
## Time difference of 2.346269 secs
start_time <- Sys.time()
linear_model.BIC <- stepAIC(linear_model, k=log(nrow(train)), trace=0)
end_time <- Sys.time()
end_time - start_time
## Time difference of 2.47857 secs
summary(linear_model.AIC)$adj.r.squared
## [1] 0.8772922
summary(linear_model.BIC)$adj.r.squared
## [1] 0.8763994
linear_model.AIC has a higher adjusted R squared than linear_model or linear_model.BIC.
We use linear_model.AIC (suggestion from the AIC based backward elimination) to analyze the residuals.
par(mfrow = c(2, 2))
plot(linear_model.AIC, which = c(1,2,3,4))
From the result, we notice that the leverage for some observations is one. This is due to the fact that these observations take values of a level with only one member. We ignore this case. Next, from the residual plots, we see that the residuals for observations 498, 906 and 979 are about 4-6.25 standard deviations away from the estimated mean. We acknowledge that these points are high leverage points, but since their Cook’s distance (a scalar metric that indicates the strength of the influence of a single observation on the overall fit) falls well below 0.5, we decide against excluding these observations from our analysis. Besides these outliers, the residual plots suggest normality and homoscedasticity.
We calculate RMSE in its original unit ($) using AIC and BIC models.
# Train
# Full
predict.linear_model <- exp(predict(linear_model, train))
## Warning in predict.lm(linear_model, train): prediction from a rank-deficient fit
## may be misleading
resid.linear_model <- train$price - predict.linear_model
rmse.linear_model <- sqrt(mean(resid.linear_model^2))
cat('Linear model (Full) RMSE on train: ', rmse.linear_model, '\n')
## Linear model (Full) RMSE on train: 27751.15
# AIC
predict.linear_model.AIC <- exp(predict(linear_model.AIC, train))
resid.linear_model.AIC <- train$price - predict.linear_model.AIC
rmse.linear_model.AIC <- sqrt(mean(resid.linear_model.AIC^2))
cat('Linear model (AIC) RMSE on train: ', rmse.linear_model.AIC, '\n')
## Linear model (AIC) RMSE on train: 27960.23
# BIC
predict.linear_model.BIC <- exp(predict(linear_model.BIC, train))
resid.linear_model.BIC <- train$price - predict.linear_model.BIC
rmse.linear_model.BIC <- sqrt(mean(resid.linear_model.BIC^2))
cat('Linear model (BIC) RMSE on train: ', rmse.linear_model.BIC, '\n')
## Linear model (BIC) RMSE on train: 28189.23
# Predicted-vs-actual calibration plots on train; the blue line is the
# identity. ("Predited" -> "Predicted" typo fixed in the axis labels.)
p1 <- ggplot(aes(x=price, y=predict.linear_model), data=train) + geom_point(alpha=0.5) + geom_abline(intercept=0, slope=1, color='blue') + xlab("Actual Value") + ylab("Predicted Value") + ggtitle("Full Model Train")
p2 <- ggplot(aes(x=price, y=predict.linear_model.AIC), data=train) + geom_point(alpha=0.5) + geom_abline(intercept=0, slope=1, color='blue') + xlab("Actual Value") + ylab("Predicted Value") + ggtitle("AIC Model Train")
p3 <- ggplot(aes(x=price, y=predict.linear_model.BIC), data=train) + geom_point(alpha=0.5) + geom_abline(intercept=0, slope=1, color='blue') + xlab("Actual Value") + ylab("Predicted Value") + ggtitle("BIC Model Train")
grid.arrange(
p1, p2, p3,
nrow = 1,
bottom = textGrob(
"",
gp = gpar(fontface = 3, fontsize = 9),
hjust = 1,
x = 1
)
)
# Test
# Full
predict.linear_model <- exp(predict(linear_model, test))
## Warning in predict.lm(linear_model, test): prediction from a rank-deficient fit
## may be misleading
resid.linear_model <- test$price - predict.linear_model
rmse.linear_model <- sqrt(mean(resid.linear_model^2))
cat('Linear model (Full) RMSE on test: ', rmse.linear_model, '\n')
## Linear model (Full) RMSE on test: 22412.14
# AIC
predict.linear_model.AIC <- exp(predict(linear_model.AIC, test))
resid.linear_model.AIC <- test$price - predict.linear_model.AIC
rmse.linear_model.AIC <- sqrt(mean(resid.linear_model.AIC^2))
cat('Linear model (AIC) RMSE on test: ', rmse.linear_model.AIC, '\n')
## Linear model (AIC) RMSE on test: 22572.78
# BIC
predict.linear_model.BIC <- exp(predict(linear_model.BIC, test))
resid.linear_model.BIC <- test$price - predict.linear_model.BIC
rmse.linear_model.BIC <- sqrt(mean(resid.linear_model.BIC^2))
cat('Linear model (BIC) RMSE on test: ', rmse.linear_model.BIC, '\n')
## Linear model (BIC) RMSE on test: 22624.94
# Predicted-vs-actual calibration plots on test; the blue line is the
# identity. ("Predited" -> "Predicted" typo fixed.)
# FIX: p3 used geom_jitter(), inconsistent with its siblings — jitter adds
# random positional noise, which misleads in a calibration plot.
p1 <- ggplot(aes(x=price, y=predict.linear_model), data=test) + geom_point(alpha=0.5) + geom_abline(intercept=0, slope=1, color='blue') + xlab("Actual Value") + ylab("Predicted Value") + ggtitle("Full Model Test")
p2 <- ggplot(aes(x=price, y=predict.linear_model.AIC), data=test) + geom_point(alpha=0.5) + geom_abline(intercept=0, slope=1, color='blue') + xlab("Actual Value") + ylab("Predicted Value") + ggtitle("AIC Model Test")
p3 <- ggplot(aes(x=price, y=predict.linear_model.BIC), data=test) + geom_point(alpha=0.5) + geom_abline(intercept=0, slope=1, color='blue') + xlab("Actual Value") + ylab("Predicted Value") + ggtitle("BIC Model Test")
grid.arrange(
p1, p2, p3,
nrow = 1,
bottom = textGrob(
"",
gp = gpar(fontface = 3, fontsize = 9),
hjust = 1,
x = 1
)
)
The results suggest that the RMSE for predictions on the training data set for the AIC model is lower than for the BIC model by about $229. In the scatter plots showing the predicted value vs. actual value, the AIC model seems to fit the data better, especially at large values. Interestingly, the RMSE on the testing data set is significantly lower than on the training data set for both the AIC and BIC models, which implies that the test set happens to contain fewer extreme or hard-to-predict observations than the training set.
We use cross validation for hyper parameter tuning (\(\lambda\)), which is the penalty factor for the L2 norm of the coefficients.
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1
set.seed(1)
x_train = as.matrix(train[,-1])
y_train = log(train$price)
x_test = as.matrix(test[,-1])
y_test = log(test$price)
# Returns sequence of models with different lambdas
lambdas <- 10^seq(3, -3, by = -.1)
ridge_model_glmridge = glmnet(x_train, y_train, alpha=0, lambda=lambdas)
cv_ridge <- cv.glmnet(x_train, y_train, alpha=0, lambda=lambdas)
plot(cv_ridge)
optimal_lambda_ridge <- cv_ridge$lambda.min
cat('Optimal lambda for ridge regression: ', optimal_lambda_ridge, '\n')
## Optimal lambda for ridge regression: 0.01258925
predict.ridge_model_train <- exp(predict(ridge_model_glmridge, s=optimal_lambda_ridge, x_train))
resid.ridge_model_train <- train$price - predict.ridge_model_train
rmse.ridge_model_train <- sqrt(mean(resid.ridge_model_train^2))
sse.train <- sum((predict.ridge_model_train - train$price)^2)
sst.train <- sum((train$price - mean(train$price))^2)
r_square.train <- 1 - sse.train/sst.train
cat('Ridge regression RMSE and R-square on train:\t', rmse.ridge_model_train, '\t', r_square.train, '\n')
## Ridge regression RMSE and R-square on train: 28024.2 0.8828265
# Ridge calibration plot on train ("Predited" -> "Predicted" typo fixed).
p1 <- ggplot(aes(x=price, y=predict.ridge_model_train), data=train) + geom_point(alpha=0.5) + geom_abline(intercept=0, slope=1, color='blue') + xlab("Actual Value") + ylab("Predicted Value") + ggtitle("Ridge Regression on Train Data")
predict.ridge_model_test <- exp(predict(ridge_model_glmridge, s=optimal_lambda_ridge, x_test))
resid.ridge_model_test <- test$price - predict.ridge_model_test
rmse.ridge_model_test <- sqrt(mean(resid.ridge_model_test^2))
sse.test <- sum((predict.ridge_model_test - test$price)^2)
sst.test <- sum((test$price - mean(test$price))^2)
r_square.test <- 1 - sse.test/sst.test
cat('Ridge regression RMSE and R-square on test:\t', rmse.ridge_model_test, '\t', r_square.test, '\n')
## Ridge regression RMSE and R-square on test: 22518.91 0.9041723
# Ridge calibration plot on test ("Predited" -> "Predicted" typo fixed).
p2 <- ggplot(aes(x=price, y=predict.ridge_model_test), data=test) + geom_point(alpha=0.5) + geom_abline(intercept=0, slope=1, color='blue') + xlab("Actual Value") + ylab("Predicted Value") + ggtitle("Ridge Regression on Test Data")
grid.arrange(
p1, p2,
nrow = 1,
bottom = textGrob(
"",
gp = gpar(fontface = 3, fontsize = 9),
hjust = 1,
x = 1
)
)
We use cross validation for hyper parameter tuning (\(\lambda\)), which is the penalty factor for the L1 norm of the coefficients.
set.seed(1)
lambdas <- 10^seq(2, -3, by=-0.1)
lasso_model_glmridge = glmnet(x_train, y_train, alpha=1, lambda=lambdas)
cv_lasso <- cv.glmnet(x_train, y_train, alpha=1, lambda=lambdas)
plot(cv_lasso)
optimal_lambda_lasso <- cv_lasso$lambda.min
cat('Optimal lambda for lasso regression:\t', optimal_lambda_lasso, '\n')
## Optimal lambda for lasso regression: 0.002511886
predict.lasso_model_train <- exp(predict(lasso_model_glmridge, s=optimal_lambda_lasso, x_train))
resid.lasso_model_train <- train$price - predict.lasso_model_train
rmse.lasso_model_train <- sqrt(mean(resid.lasso_model_train^2))
sse.train <- sum((predict.lasso_model_train - train$price)^2)
sst.train <- sum((train$price - mean(train$price))^2)
r_square.train <- 1 - sse.train/sst.train
cat('Lasso regression RMSE and R-square on train:\t', rmse.lasso_model_train, '\t', r_square.train, '\n')
## Lasso regression RMSE and R-square on train: 28517.89 0.8786617
# Lasso calibration plot on train ("Predited" -> "Predicted" typo fixed).
p1 <- ggplot(aes(x=price, y=predict.lasso_model_train), data=train) + geom_point(alpha=0.5) + geom_abline(intercept=0, slope=1, color='blue') + xlab("Actual Value") + ylab("Predicted Value") + ggtitle("Lasso Regression on Train Data")
predict.lasso_model_test <- exp(predict(lasso_model_glmridge, s=optimal_lambda_lasso, x_test))
resid.lasso_model_test <- test$price - predict.lasso_model_test
rmse.lasso_model_test <- sqrt(mean(resid.lasso_model_test^2))
sse.test <- sum((predict.lasso_model_test - test$price)^2)
sst.test <- sum((test$price - mean(test$price))^2)
r_square.test <- 1 - sse.test/sst.test
cat('Lasso regression RMSE and R-square on test:\t', rmse.lasso_model_test, '\t', r_square.test, '\n')
## Lasso regression RMSE and R-square on test: 22566.63 0.9037657
# Lasso calibration plot on test ("Predited" -> "Predicted" typo fixed).
p2 <- ggplot(aes(x=price, y=predict.lasso_model_test), data=test) + geom_point(alpha=0.5) + geom_abline(intercept=0, slope=1, color='blue') + xlab("Actual Value") + ylab("Predicted Value") + ggtitle("Lasso Regression on Test Data")
grid.arrange(
p1, p2,
nrow = 1,
bottom = textGrob(
"",
gp = gpar(fontface = 3, fontsize = 9),
hjust = 1,
x = 1
)
)
We carry out repeated 10-fold cross validation 5 times to find the optimal parameters (lambda and alpha). We use a randomized grid search.
set.seed(1)
# Set training control
train_cont <- trainControl(method="repeatedcv", number=10, repeats=5, search="random", verboseIter=FALSE)
# Train the model
elastic_model <- train(log(price) ~ ., data=train, method="glmnet", tuneLength=10, trControl=train_cont)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
# Best tuning parameter
elastic_model$bestTune
## alpha lambda
## 2 0.1765568 0.006605359
predict.elastic_model_train <- exp(predict(elastic_model, x_train))
resid.elastic_model_train <- train$price - predict.elastic_model_train
rmse.elastic_model_train <- sqrt(mean(resid.elastic_model_train^2))
sse.train <- sum((predict.elastic_model_train - train$price)^2)
sst.train <- sum((train$price - mean(train$price))^2)
r_square.train <- 1 - sse.train/sst.train
cat('\nElastic regression RMSE and R-square on train:\t', rmse.elastic_model_train, '\t', r_square.train, '\n')
##
## Elastic regression RMSE and R-square on train: 28221.66 0.8811694
# Elastic-net calibration plot on train ("Predited" -> "Predicted" typo fixed).
p1 <- ggplot(aes(x=price, y=predict.elastic_model_train), data=train) + geom_point(alpha=0.5) + geom_abline(intercept=0, slope=1, color='blue') + xlab("Actual Value") + ylab("Predicted Value") + ggtitle("Elastic Regression on Train Data")
predict.elastic_model_test <- exp(predict(elastic_model, x_test))
resid.elastic_model_test <- test$price - predict.elastic_model_test
rmse.elastic_model_test <- sqrt(mean(resid.elastic_model_test^2))
sse.test <- sum((predict.elastic_model_test - test$price)^2)
sst.test <- sum((test$price - mean(test$price))^2)
r_square.test <- 1 - sse.test/sst.test
cat('Elastic regression RMSE and R-square on test:\t', rmse.elastic_model_test, '\t', r_square.test, '\n')
## Elastic regression RMSE and R-square on test: 22495.49 0.9043716
# Elastic-net calibration plot on test ("Predited" -> "Predicted" typo fixed).
p2 <- ggplot(aes(x=price, y=predict.elastic_model_test), data=test) + geom_point(alpha=0.5) + geom_abline(intercept=0, slope=1, color='blue') + xlab("Actual Value") + ylab("Predicted Value") + ggtitle("Elastic Regression on Test Data")
grid.arrange(
p1, p2,
nrow = 1,
bottom = textGrob(
"",
gp = gpar(fontface = 3, fontsize = 9),
hjust = 1,
x = 1
)
)
Need to understand how the coefficients are updated by the observations and how the probabilities are calculated.
# Fit the model: Bayesian
bma <- bas.lm(log(price) ~ . , data=train, prior="BIC", modelprior=uniform())
# Print out the marginal posterior inclusion probability of each variable
bma
##
## Call:
## bas.lm(formula = log(price) ~ ., data = train, prior = "BIC",
## modelprior = uniform())
##
##
## Marginal Posterior Inclusion Probabilities:
## Intercept Overall.Qual Total.Bsmt.SF
## 1.00000 1.00000 1.00000
## Garage.Cars Year.Built Year.Remod.Add
## 0.96953 0.56340 0.99994
## Full.Bath Fireplaces BsmtFin.Type.1
## 0.04294 0.99993 0.02478
## Kitchen.QualFa Kitchen.QualGd Kitchen.QualPo
## 0.02887 0.14319 0.02371
## Kitchen.QualTA Garage.TypeBasment Garage.TypeBuiltIn
## 0.99591 0.02455 0.02328
## Garage.TypeCarPort Garage.TypeNone Bsmt.QualEx
## 0.02252 0.02826 0.99836
## Bsmt.QualFa Bsmt.QualGd Bsmt.QualPo
## 0.07259 0.02742 0.02234
## MS.ZoningI..all. Fireplace.QuFa Fireplace.QuPo
## 0.18955 0.02212 0.02256
## Exterior.1stAsphShn Exterior.1stBrkComm Exterior.1stBrkFace
## 0.49701 0.68095 0.47708
## Exterior.1stCBlock Exterior.1stCemntBd Exterior.1stImStucc
## 0.50194 0.02308 0.02269
## Exterior.1stPlywood Exterior.1stPreCast Exterior.1stStucco
## 0.02874 0.49879 0.02805
## Exterior.1stWdShing House.Style1.5Unf House.Style1Story
## 0.02688 0.02373 0.99992
## House.Style2.5Fin House.Style2.5Unf House.Style2Story
## 0.49597 0.07644 0.99809
## House.StyleSFoyer House.StyleSLvl Central.AirY
## 0.02709 0.03495 1.00000
## Log.area Neighborhood.Binned1 Neighborhood.Binned2
## 1.00000 1.00000 1.00000
## Neighborhood.Binned3 MS.SubClass.Binned1 MS.SubClass.Binned2
## 1.00000 0.95878 0.99629
## MS.SubClass.Binned3
## 0.10437
# Top 5 most probably models
summary(bma)
## P(B != 0 | Y) model 1 model 2 model 3 model 4
## Intercept 1.00000000 1.0000 1.0000 1.0000 1.0000
## Overall.Qual 1.00000000 1.0000 1.0000 1.0000 1.0000
## Total.Bsmt.SF 1.00000000 1.0000 1.0000 1.0000 1.0000
## Garage.Cars 0.96953302 1.0000 1.0000 1.0000 1.0000
## Year.Built 0.56340247 1.0000 1.0000 1.0000 1.0000
## Year.Remod.Add 0.99993873 1.0000 1.0000 1.0000 1.0000
## Full.Bath 0.04294058 0.0000 0.0000 0.0000 0.0000
## Fireplaces 0.99992630 1.0000 1.0000 1.0000 1.0000
## BsmtFin.Type.1 0.02477998 0.0000 0.0000 0.0000 0.0000
## Kitchen.QualFa 0.02887293 0.0000 0.0000 0.0000 0.0000
## Kitchen.QualGd 0.14319196 0.0000 0.0000 0.0000 0.0000
## Kitchen.QualPo 0.02370923 0.0000 0.0000 0.0000 0.0000
## Kitchen.QualTA 0.99590679 1.0000 1.0000 1.0000 1.0000
## Garage.TypeBasment 0.02454504 0.0000 0.0000 0.0000 0.0000
## Garage.TypeBuiltIn 0.02327786 0.0000 0.0000 0.0000 0.0000
## Garage.TypeCarPort 0.02252438 0.0000 0.0000 0.0000 0.0000
## Garage.TypeNone 0.02825739 0.0000 0.0000 0.0000 0.0000
## Bsmt.QualEx 0.99835654 1.0000 1.0000 1.0000 1.0000
## Bsmt.QualFa 0.07259172 0.0000 0.0000 0.0000 0.0000
## Bsmt.QualGd 0.02742037 0.0000 0.0000 0.0000 0.0000
## Bsmt.QualPo 0.02233905 0.0000 0.0000 0.0000 0.0000
## MS.ZoningI..all. 0.18955098 0.0000 0.0000 0.0000 0.0000
## Fireplace.QuFa 0.02211737 0.0000 0.0000 0.0000 0.0000
## Fireplace.QuPo 0.02256484 0.0000 0.0000 0.0000 0.0000
## Exterior.1stAsphShn 0.49701018 0.0000 0.0000 0.0000 1.0000
## Exterior.1stBrkComm 0.68095257 1.0000 1.0000 1.0000 1.0000
## Exterior.1stBrkFace 0.47708109 1.0000 1.0000 1.0000 1.0000
## Exterior.1stCBlock 0.50194005 1.0000 0.0000 1.0000 1.0000
## Exterior.1stCemntBd 0.02308044 0.0000 0.0000 0.0000 0.0000
## Exterior.1stImStucc 0.02268727 0.0000 0.0000 0.0000 0.0000
## Exterior.1stPlywood 0.02873601 0.0000 0.0000 0.0000 0.0000
## Exterior.1stPreCast 0.49878631 0.0000 0.0000 1.0000 1.0000
## Exterior.1stStucco 0.02804627 0.0000 0.0000 0.0000 0.0000
## Exterior.1stWdShing 0.02687561 0.0000 0.0000 0.0000 0.0000
## House.Style1.5Unf 0.02372789 0.0000 0.0000 0.0000 0.0000
## House.Style1Story 0.99991793 1.0000 1.0000 1.0000 1.0000
## House.Style2.5Fin 0.49596883 0.0000 0.0000 0.0000 0.0000
## House.Style2.5Unf 0.07644254 0.0000 0.0000 0.0000 0.0000
## House.Style2Story 0.99809045 1.0000 1.0000 1.0000 1.0000
## House.StyleSFoyer 0.02709384 0.0000 0.0000 0.0000 0.0000
## House.StyleSLvl 0.03494704 0.0000 0.0000 0.0000 0.0000
## Central.AirY 1.00000000 1.0000 1.0000 1.0000 1.0000
## Log.area 1.00000000 1.0000 1.0000 1.0000 1.0000
## Neighborhood.Binned1 0.99999814 1.0000 1.0000 1.0000 1.0000
## Neighborhood.Binned2 0.99999853 1.0000 1.0000 1.0000 1.0000
## Neighborhood.Binned3 1.00000000 1.0000 1.0000 1.0000 1.0000
## MS.SubClass.Binned1 0.95878285 1.0000 1.0000 1.0000 1.0000
## MS.SubClass.Binned2 0.99628982 1.0000 1.0000 1.0000 1.0000
## MS.SubClass.Binned3 0.10437164 0.0000 0.0000 0.0000 0.0000
## BF NA 1.0000 1.0000 1.0000 1.0000
## PostProbs NA 0.0031 0.0031 0.0031 0.0031
## R2 NA 0.8788 0.8788 0.8788 0.8788
## dim NA 21.0000 20.0000 22.0000 23.0000
## logmarg NA -1601.4311 -1601.4311 -1601.4311 -1601.4311
## model 5
## Intercept 1.0000
## Overall.Qual 1.0000
## Total.Bsmt.SF 1.0000
## Garage.Cars 1.0000
## Year.Built 1.0000
## Year.Remod.Add 1.0000
## Full.Bath 0.0000
## Fireplaces 1.0000
## BsmtFin.Type.1 0.0000
## Kitchen.QualFa 0.0000
## Kitchen.QualGd 0.0000
## Kitchen.QualPo 0.0000
## Kitchen.QualTA 1.0000
## Garage.TypeBasment 0.0000
## Garage.TypeBuiltIn 0.0000
## Garage.TypeCarPort 0.0000
## Garage.TypeNone 0.0000
## Bsmt.QualEx 1.0000
## Bsmt.QualFa 0.0000
## Bsmt.QualGd 0.0000
## Bsmt.QualPo 0.0000
## MS.ZoningI..all. 0.0000
## Fireplace.QuFa 0.0000
## Fireplace.QuPo 0.0000
## Exterior.1stAsphShn 1.0000
## Exterior.1stBrkComm 1.0000
## Exterior.1stBrkFace 1.0000
## Exterior.1stCBlock 1.0000
## Exterior.1stCemntBd 0.0000
## Exterior.1stImStucc 0.0000
## Exterior.1stPlywood 0.0000
## Exterior.1stPreCast 0.0000
## Exterior.1stStucco 0.0000
## Exterior.1stWdShing 0.0000
## House.Style1.5Unf 0.0000
## House.Style1Story 1.0000
## House.Style2.5Fin 0.0000
## House.Style2.5Unf 0.0000
## House.Style2Story 1.0000
## House.StyleSFoyer 0.0000
## House.StyleSLvl 0.0000
## Central.AirY 1.0000
## Log.area 1.0000
## Neighborhood.Binned1 1.0000
## Neighborhood.Binned2 1.0000
## Neighborhood.Binned3 1.0000
## MS.SubClass.Binned1 1.0000
## MS.SubClass.Binned2 1.0000
## MS.SubClass.Binned3 0.0000
## BF 1.0000
## PostProbs 0.0031
## R2 0.8788
## dim 22.0000
## logmarg -1601.4311
# RMSE on Training Data
# Highest Probability Model
pred.train.HPM <- predict(bma, newdata=train, estimator="HPM")
pred.HPM.rmse <- sqrt(mean((exp(pred.train.HPM$fit) - train$price)^2))
sse.train <- sum((exp(pred.train.HPM$fit) - train$price)^2)
sst.train <- sum((train$price - mean(train$price))^2)
r_square.train <- 1 - sse.train/sst.train
cat('Highest Probability Model (RMSE & R Square) :\t', pred.HPM.rmse, '\t', r_square.train, '\n')
## Highest Probability Model (RMSE & R Square) : 28189.23 0.8814424
# Median Probability Model
pred.train.MPM <- predict(bma, newdata=train, estimator="MPM")
pred.MPM.rmse <- sqrt(mean((exp(pred.train.MPM$fit) - train$price)^2))
sse.train <- sum((exp(pred.train.MPM$fit) - train$price)^2)
sst.train <- sum((train$price - mean(train$price))^2)
r_square.train <- 1 - sse.train/sst.train
cat('Median Probability Model (RMSE & R Square) :\t', pred.MPM.rmse, '\t', r_square.train, '\n')
## Median Probability Model (RMSE & R Square) : 28357.3 0.8800245
# Best Predictive Model on train (computation heavy, left disabled)
# NOTE(review): estimator='BPM' selects BAS's "Best Predictive Model"; it is
# NOT an average over all models (that would be estimator='BMA') — confirm the
# intended estimator before re-enabling. Also fixed the stale pred.train.MPM
# reference so the SSE matches the BPM predictions.
#pred.train.BPM <- predict(bma, newdata=train, estimator='BPM')
#pred.BPM.rmse <- sqrt(mean((exp(pred.train.BPM$fit) - train$price)^2))
#sse.train <- sum((exp(pred.train.BPM$fit) - train$price)^2)
#sst.train <- sum((train$price - mean(train$price))^2)
#r_square.train <- 1 - sse.train/sst.train
#cat('Best Predictive Model:\t', pred.BPM.rmse, '\t', r_square.train)
# RMSE on Testing Data
# Highest Probability Model
pred.test.HPM <- predict(bma, newdata=test, estimator="HPM")
pred.HPM.rmse <- sqrt(mean((exp(pred.test.HPM$fit) - test$price)^2))
sse.test <- sum((exp(pred.test.HPM$fit) - test$price)^2)
sst.test <- sum((test$price - mean(test$price))^2)
r_square.test <- 1 - sse.test/sst.test
cat('Highest Probability Model (RMSE & R Square) :\t', pred.HPM.rmse, '\t', r_square.test, '\n')
## Highest Probability Model (RMSE & R Square) : 22624.94 0.9032678
# Median Probability Model
pred.test.MPM <- predict(bma, newdata=test, estimator="MPM")
pred.MPM.rmse <- sqrt(mean((exp(pred.test.MPM$fit) - test$price)^2))
sse.test <- sum((exp(pred.test.MPM$fit) - test$price)^2)
sst.test <- sum((test$price - mean(test$price))^2)
r_square.test <- 1 - sse.test/sst.test
cat('Median Probability Model (RMSE & R Square) :\t', pred.MPM.rmse, '\t', r_square.test, '\n')
## Median Probability Model (RMSE & R Square) : 22672.57 0.9028601
# Average over all the models
#pred.test.BPM <- predict(bma, newdata=test, estimator='BPM')
#pred.BPM.rmse <- sqrt(mean((exp(pred.test.BPM$fit) - test$price)^2))
#sse.test <- sum((exp(pred.test.MPM$fit) - test$price)^2)
#sst.test <- sum((test$price - mean(test$price))^2)
#r_square.test <- 1 - sse.test/sst.test
#cat('Average over all the models:\t', pred.BPM.rmse, '\t', r_square.test)
Kernel regression is usually performed with just one predictor because of the curse of dimensionality: as the number of predictors grows, so does the computational cost (roughly quadratically). We find the optimal bandwidth using npregbw().
library(np)
## Nonparametric Kernel Methods for Mixed Datatypes (version 0.60-10)
## [vignette("np_faq",package="np") provides answers to frequently asked questions]
## [vignette("np",package="np") an overview]
## [vignette("entropy_np",package="np") an overview of entropy-based methods]
# https://bookdown.org/egarpor/NP-UC3M/kre-i-np.html
# The spinner can be omitted with
options(np.messages = FALSE)
# npregbw() computes by default the least squares CV bandwidth associated to a local *constant* fit
# Bandwidth selection with three predictors; regtype="ll" switches to a local
# *linear* fit. Both calls can be slow: bandwidths are chosen by
# least-squares cross validation.
bw_lc <- npregbw(log(price) ~ Log.area + Overall.Qual + Total.Bsmt.SF, data=train)
bw_ll <- npregbw(log(price) ~ Log.area + Overall.Qual + Total.Bsmt.SF, data=train, regtype="ll")
# Locally constant Gaussian kernel regression on the training data
kre_lc_train <- npreg(bws=bw_lc)
# Locally linear Gaussian kernel regression on the training data
kre_ll_train <- npreg(bws=bw_ll)
# Visualization only good for one predictor. Otherwise, it could be a mess.
#plot(kre_lc, col=2, type = "o")
#points(train$Log.area, log(train$price))
#plot(kre_ll, col=2, type = "o")
#points(train$Log.area, log(train$price))
# Locally linear Gaussian kernel regression on the testing data
kre_ll_test <- npreg(bw_ll, newdata=test, y.eval=TRUE)
kre_lc_test <- npreg(bw_lc, newdata=test, y.eval=TRUE)
# Fitted values ($mean) are on the log(price) scale; exponentiate before
# computing RMSE / R^2 on the dollar scale.
pred.kre_ll <- kre_ll_test$mean
rmse <- sqrt(mean((exp(pred.kre_ll) - test$price)^2))
sse.test <- sum((exp(pred.kre_ll) - test$price)^2)
sst.test <- sum((test$price - mean(test$price))^2)
r_square.test <- 1 - sse.test/sst.test
cat('Locally Linear Gaussian Kernel Regression (RMSE & R Square) :\t', rmse, '\t', r_square.test, '\n')
## Locally Linear Gaussian Kernel Regression (RMSE & R Square) : 33730.05 0.7850041
pred.kre_lc <- kre_lc_test$mean
rmse <- sqrt(mean((exp(pred.kre_lc) - test$price)^2))
sse.test <- sum((exp(pred.kre_lc) - test$price)^2)
sst.test <- sum((test$price - mean(test$price))^2)
r_square.test <- 1 - sse.test/sst.test
cat('Locally Constant Gaussian Kernel Regression (RMSE & R Square) :\t', rmse, '\t', r_square.test, '\n')
## Locally Constant Gaussian Kernel Regression (RMSE & R Square) : 33632.61 0.7862444
Pruning (stopping the tree from growing further, i.e. avoiding a split if it does not significantly improve the overall quality of the model) is performed with the caret package, which invokes the rpart method to automatically test different values of cp. It then chooses the optimal cp that maximizes the cross-validation accuracy and fits the final CART model that best explains the data. tuneLength specifies the number of candidate cp values to evaluate; the default is 3, and here we use 10.
library(rpart)
library(rpart.plot)
# Tune the complexity parameter cp by 10-fold CV over 10 candidate values.
# NOTE(review): train()/trainControl() come from caret, presumably loaded
# earlier in the file -- confirm.
regression_tree <- train(log(price) ~ . , data=train, method="rpart", trControl=trainControl("cv", number=10), tuneLength=10 )
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
# Plot model error vs different values of complexity parameter
plot(regression_tree)
# Print the best tuning parameter cp that minimize the model RMSE
regression_tree$bestTune
## cp
## 1 0.01266821
# Plot the final tree model
rpart.plot(regression_tree$finalModel)
# Decision rules in the model
regression_tree$finalModel
## n= 1000
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 1000 176.727500 12.01847
## 2) Overall.Qual< 0.3318801 629 60.414460 11.80213
## 4) Overall.Qual< -1.130546 86 15.652640 11.40766
## 8) Total.Bsmt.SF< -0.6260206 48 7.877014 11.19275
## 16) Overall.Qual< -2.592973 9 2.355814 10.71026 *
## 17) Overall.Qual>=-2.592973 39 2.942480 11.30410 *
## 9) Total.Bsmt.SF>=-0.6260206 38 2.758337 11.67913 *
## 5) Overall.Qual>=-1.130546 543 29.260410 11.86461
## 10) Garage.Cars< -0.3339475 261 9.425580 11.73711 *
## 11) Garage.Cars>=-0.3339475 282 11.665930 11.98260
## 22) Full.Bath< -0.09302467 134 2.999426 11.87981 *
## 23) Full.Bath>=-0.09302467 148 5.968635 12.07567 *
## 3) Overall.Qual>=0.3318801 371 36.962170 12.38526
## 6) Garage.Cars< 1.018618 245 12.165020 12.24190
## 12) Total.Bsmt.SF< 1.085312 204 8.135974 12.19347 *
## 13) Total.Bsmt.SF>=1.085312 41 1.169294 12.48290 *
## 7) Garage.Cars>=1.018618 126 9.972419 12.66400
## 14) Total.Bsmt.SF< 1.456459 70 3.599219 12.51377 *
## 15) Total.Bsmt.SF>=1.456459 56 2.818360 12.85179 *
# Train -- the tree predicts log(price); exponentiate before RMSE / R^2.
best_tree <- regression_tree$finalModel
predictions.train <- predict(best_tree, train)
rmse.train <- sqrt(mean((exp(predictions.train) - train$price)^2))
sse.train <- sum((exp(predictions.train) - train$price)^2)
sst.train <- sum((train$price - mean(train$price))^2)
r_square.train <- 1 - sse.train/sst.train
cat('Regression Tree on Train (RMSE & R Square) :\t', rmse.train, '\t', r_square.train, '\n')
## Regression Tree on Train (RMSE & R Square) : 40227.7 0.7585577
# Test
predictions.test <- predict(best_tree, test)
rmse.test <- sqrt(mean((exp(predictions.test) - test$price)^2))
sse.test <- sum((exp(predictions.test) - test$price)^2)
sst.test <- sum((test$price - mean(test$price))^2)
r_square.test <- 1 - sse.test/sst.test
cat('Regression Tree on test (RMSE & R Square) :\t', rmse.test, '\t', r_square.test, '\n')
## Regression Tree on test (RMSE & R Square) : 41332.31 0.6771687
Here, we will use 5-fold cross validation 5 times (repeated CV). mtry is the number of variables randomly sampled as candidates at each split.
library(randomForest)
# Set up a 5-fold cross validation
#control <- trainControl(method="cv", number=5)
# Set up a 5-fold cross validation 5 times (repeated CV)
control <- trainControl(method="repeatedcv", number=5, repeats=5)
random_forest_model <- train(log(price) ~ . , data=train, method="rf", trControl=control)
# Best Model
random_forest_model$bestTune
## mtry
## 2 25
# Train -- predictions are log(price); exponentiate for dollar-scale metrics.
best_forest <- random_forest_model$finalModel
predictions.train <- predict(best_forest, train)
rmse.train <- sqrt(mean((exp(predictions.train) - train$price)^2))
sse.train <- sum((exp(predictions.train) - train$price)^2)
sst.train <- sum((train$price - mean(train$price))^2)
r_square.train <- 1 - sse.train/sst.train
# Fixed: the labels previously said 'Regression Tree' (copy/paste from the
# CART section) although these metrics are for the random forest.
cat('Random Forest on Train (RMSE & R Square) :\t', rmse.train, '\t', r_square.train, '\n')
## Random Forest on Train (RMSE & R Square) : 13599.28 0.9724073
# Test
predictions.test <- predict(best_forest, test)
rmse.test <- sqrt(mean((exp(predictions.test) - test$price)^2))
sse.test <- sum((exp(predictions.test) - test$price)^2)
sst.test <- sum((test$price - mean(test$price))^2)
r_square.test <- 1 - sse.test/sst.test
cat('Random Forest on test (RMSE & R Square) :\t', rmse.test, '\t', r_square.test, '\n')
## Random Forest on test (RMSE & R Square) : 27335.35 0.8587965
We use caret to run a grid search to find the optimal hyper parameters (eta, max_depth, min_child_weight). Note that this will take some time since we are not running xgboost directly using the library with the efficient data type (xgb.DMatrix).
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
# Hyper-parameter grid for caret's xgbTree method. min_child_weight is fixed
# at 4 here; the commented-out line shows the wider range that was explored.
xgb_grid = expand.grid(
nrounds = 1000,
eta = c(0.1, 0.05, 0.01),
max_depth = c(2, 3, 4, 5, 6),
gamma = 0,
colsample_bytree=1,
#min_child_weight=c(1, 2, 3, 4 ,5),
min_child_weight=c(4),
subsample=1
)
Let caret find the best hyperparameter values (using 5 fold cross validation).
# Time the caret grid search. With the train() call commented out, this only
# measures the (negligible) cost of building the control object; the recorded
# 20.8 s figure below is from a run with one parameter combination enabled.
start_time <- Sys.time()
control <-trainControl(method="cv", number=5)
#xgb_caret <- train(log(price) ~ . , data=train, method='xgbTree', trControl=control, tuneGrid=xgb_grid)
#xgb_caret$bestTune
end_time <- Sys.time()
end_time - start_time
## Time difference of 0.001997948 secs
# Time difference of 20.82481 secs for one set of parameters
The optimal parameters are Max_depth=3, eta=0.05, Min_child_weight=4.
# Final xgboost parameters, using the best values from the caret search above
# (max_depth=3, eta=0.05, min_child_weight=4).
default_param<-list(
objective = "reg:squarederror",
booster = "gbtree",
eta=0.05, #default = 0.3
gamma=0,
max_depth=3, #default=6
min_child_weight=4, #default=1
subsample=1,
colsample_bytree=1
)
# put our testing & training data into two separate Dmatrixs objects
# NOTE(review): x_train/y_train and x_test/y_test are assumed to be defined
# earlier in the file (feature matrix plus log(price) labels) -- confirm.
dtrain <- xgb.DMatrix(data=x_train, label=y_train)
dtest <- xgb.DMatrix(data=x_test, label=y_test)
We do a cross validation on the number of rounds using the method xgb.cv from xgboost.
# 5-fold cross validation over the number of boosting rounds, with early
# stopping after 10 rounds without improvement in test RMSE.
xgbcv <- xgb.cv(params=default_param, data=dtrain, nrounds=500, nfold=5, showsd=TRUE, stratified=TRUE, print_every_n=40, early_stopping_rounds=10, maximize=FALSE)
## [1] train-rmse:10.951333+0.006891 test-rmse:10.951263+0.029457
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 10 rounds.
##
## [41] train-rmse:1.433526+0.001023 test-rmse:1.435024+0.016870
## [81] train-rmse:0.229844+0.001913 test-rmse:0.248510+0.016353
## [121] train-rmse:0.123191+0.002988 test-rmse:0.160469+0.022712
## [161] train-rmse:0.114077+0.002535 test-rmse:0.157740+0.023954
## [201] train-rmse:0.108690+0.002335 test-rmse:0.157171+0.024501
## [241] train-rmse:0.104254+0.002650 test-rmse:0.156626+0.024581
## Stopping. Best iteration:
## [244] train-rmse:0.103846+0.002685 test-rmse:0.156606+0.024545
# Train the model using the best iteration found by cross validation.
# Fixed: the round count was hard-coded as 254, which did not match the best
# iteration actually reported by xgb.cv (244 in the recorded run above).
xgb_model <- xgb.train(data=dtrain, params=default_param, nrounds=xgbcv$best_iteration)
# Train -- predictions are log(price); exponentiate for dollar-scale metrics.
predictions.train <- predict(xgb_model, dtrain)
rmse.train <- sqrt(mean((exp(predictions.train) - train$price)^2))
sse.train <- sum((exp(predictions.train) - train$price)^2)
sst.train <- sum((train$price - mean(train$price))^2)
r_square.train <- 1 - sse.train/sst.train
cat('XGB on Train (RMSE & R Square) :\t', rmse.train, '\t', r_square.train, '\n')
## XGB on Train (RMSE & R Square) : 20013.28 0.9402415
# Test
predictions.test <- predict(xgb_model, dtest)
rmse.test <- sqrt(mean((exp(predictions.test) - test$price)^2))
sse.test <- sum((exp(predictions.test) - test$price)^2)
sst.test <- sum((test$price - mean(test$price))^2)
r_square.test <- 1 - sse.test/sst.test
cat('XGB on test (RMSE & R Square) :\t\t', rmse.test, '\t', r_square.test, '\n')
## XGB on test (RMSE & R Square) : 26328.89 0.869003
#view variable importance plot
library(Ckmeans.1d.dp) #required for ggplot clustering
mat <- xgb.importance(feature_names=colnames(x_train), model=xgb_model)
xgb.ggplot.importance(importance_matrix=mat[1:20], rel_to_first=TRUE)
We use caret’s method nnet to run a 5-fold cross validation to find the best number of nodes and weight decay factor.
library(neuralnet)
##
## Attaching package: 'neuralnet'
## The following object is masked from 'package:dplyr':
##
## compute
#http://sebastianderi.com/aexam/hld_MODEL_neural.html
# NOTE(review): caret's method="nnet" uses the nnet package, not neuralnet;
# the library(neuralnet) attach above looks unnecessary for this block --
# confirm before removing.
set.seed(1)
# Step 1: SELECT TUNING PARAMETERS
# Set range of tuning parameters (layer size [number of nodes] and weight decay)
tune_grid_neural <- expand.grid(size=c(1:5), decay=c(0, 0.05, 0.1, 0.5, 1))
# Set other constrains to be imposed on network (to keep computation manageable)
# NOTE(review): 'neaural' is a typo for 'neural' (left as-is: renaming is a
# code change). Also, nnet's usual weight bound is size*(p+1)+size+1 with
# p = number of *predictors*, whereas nrow(train) (number of rows) is used
# here -- confirm. The value is never passed to the train() call below, so
# it currently has no effect.
max_size_neaural <- max(tune_grid_neural$size)
max_weights_neural <- max_size_neaural*(nrow(train) + 1) + max_size_neaural + 1
# Step 2: SELECT TUNING METHOD
# set up train control object, which specifies training/testing technique
control_neural <- trainControl(method="cv", number=5)
# Step 3: TRAIN MODEL
# linout=TRUE gives a linear output unit (regression rather than classification).
nn <- train(log(price) ~ . , data=train, method="nnet", linout=TRUE, tuneGrid=tune_grid_neural, trControl=control_neural, trace=FALSE, maxit=100)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
nn_model <- nn$finalModel
# Train -- predictions are log(price); exponentiate for dollar-scale metrics.
predictions.train <- predict(nn_model, train)
rmse.train <- sqrt(mean((exp(predictions.train) - train$price)^2))
sse.train <- sum((exp(predictions.train) - train$price)^2)
sst.train <- sum((train$price - mean(train$price))^2)
r_square.train <- 1 - sse.train/sst.train
cat('NN on Train (RMSE & R Square) :\t', rmse.train, '\t', r_square.train, '\n')
## NN on Train (RMSE & R Square) : 27312.29 0.8887041
# Test
predictions.test <- predict(nn_model, test)
rmse.test <- sqrt(mean((exp(predictions.test) - test$price)^2))
sse.test <- sum((exp(predictions.test) - test$price)^2)
sst.test <- sum((test$price - mean(test$price))^2)
r_square.test <- 1 - sse.test/sst.test
cat('NN on test (RMSE & R Square) :\t', rmse.test, '\t', r_square.test, '\n')
## NN on test (RMSE & R Square) : 24324.22 0.8881917
# Print RMSE (dollar scale) and adjusted R^2 for an lm-style model whose
# response is log(price). Predictions are exponentiated back to dollars
# before the RMSE is computed; adjusted R^2 comes from the model summary
# (i.e. from the training fit, not the validation set).
eval_metrics = function(model, validation){
  fitted_price <- exp(predict(model, validation))
  rmse <- sqrt(mean((validation$price - fitted_price)^2))
  adj_r2 <- summary(model)$adj.r.squared
  cat(rmse, '\t', adj_r2, '\n')
}
# Print RMSE (dollar scale) and R^2 on the validation set for a glmnet
# ridge/lasso model fit on log(price). `optimal_lambda` selects the penalty
# used for prediction.
# Assumes price is the first column of `validation` and the remaining
# columns form the feature matrix -- TODO confirm against how x_train was
# built earlier in the file.
eval_metrics_gml = function(model, validation, optimal_lambda){
  x_validate = as.matrix(validation[,-1])
  # Back-transform log-scale predictions to the dollar scale.
  # (Removed an unused `y_validate` local from the original.)
  predictions <- exp(predict(model, s=optimal_lambda, x_validate))
  residuals <- validation$price - predictions
  rmse <- sqrt(mean(residuals^2))
  sse <- sum((predictions - validation$price)^2)
  sst <- sum((validation$price - mean(validation$price))^2)
  r_square <- 1 - sse/sst
  cat(rmse, '\t', r_square, '\n')
}
# Print RMSE (dollar scale) and R^2 on the validation set for an elastic-net
# model fit on log(price); the model's stored lambda is used for prediction.
# Assumes price is the first column of `validation` -- TODO confirm.
eval_metrics_elastic_gml = function(model, validation){
  x_validate = as.matrix(validation[,-1])
  # Back-transform log-scale predictions to the dollar scale.
  # (Removed an unused `y_validate` local from the original.)
  predictions <- exp(predict(model, x_validate))
  residuals <- validation$price - predictions
  rmse <- sqrt(mean(residuals^2))
  sse <- sum((predictions - validation$price)^2)
  sst <- sum((validation$price - mean(validation$price))^2)
  r_square <- 1 - sse/sst
  cat(rmse, '\t', r_square, '\n')
}
# Print RMSE (dollar scale) and R^2 on the validation set for a BAS Bayesian
# model-averaging fit on log(price), predicting with the Highest Probability
# Model (HPM).
eval_metrics_bma = function(model, validation){
  # Fixed: the original ignored the `model` argument and always used the
  # global `bma` object.
  predictions <- predict(model, newdata=validation, estimator="HPM")
  rmse <- sqrt(mean((exp(predictions$fit) - validation$price)^2))
  sse <- sum((exp(predictions$fit) - validation$price)^2)
  sst <- sum((validation$price - mean(validation$price))^2)
  r_square <- 1 - sse/sst
  cat(rmse, '\t', r_square, '\n')
}
# Print RMSE (dollar scale) and R^2 on the validation set for a kernel
# regression fit (np package) on log(price), given a selected bandwidth.
eval_metrics_kernel = function(bw, validation){
  # Removing outlier... Non-paramteric models are not good for extrapolation
  # NOTE(review): these hard-coded filter values target one specific
  # observation in this data set -- confirm they still apply if the data
  # changes.
  validation <- validation %>% filter(price!=284000 & Overall.Qual!=1.4286999 & Total.Bsmt.SF!=5.1786677591 & Log.area!=0.444833205)
  kre <- npreg(bw, newdata=validation, y.eval=TRUE)
  predictions <- kre$mean
  rmse <- sqrt(mean((exp(predictions) - validation$price)^2))
  sse <- sum((exp(predictions) - validation$price)^2)
  # Fixed: SST previously used the global `test` data set instead of
  # `validation`, which made the reported R^2 inconsistent with the RMSE.
  sst <- sum((validation$price - mean(validation$price))^2)
  r_square <- 1-sse/sst
  cat(rmse, '\t', r_square, '\n')
}
# Print RMSE (dollar scale) and R^2 on the validation set for any model whose
# predict() method returns log(price) (tree, random forest, neural net, ...).
eval_metrics_others = function(model, validation){
  price_hat <- exp(predict(model, validation))
  err <- price_hat - validation$price
  rmse <- sqrt(mean(err^2))
  sst <- sum((validation$price - mean(validation$price))^2)
  r_square <- 1 - sum(err^2)/sst
  cat(rmse, '\t', r_square, '\n')
}
# Print RMSE (dollar scale) and R^2 on the validation set for an xgboost
# model trained on log(price). Assumes price is the first column of
# `validation` and the rest is the feature matrix -- verify against how the
# training DMatrix was constructed.
eval_metrics_xgb = function(model, validation){
  features = as.matrix(validation[,-1])
  labels = log(validation$price)
  dval <- xgb.DMatrix(data=features, label=labels)
  price_hat <- exp(predict(model, dval))
  err <- price_hat - validation$price
  sst <- sum((validation$price - mean(validation$price))^2)
  r_square <- 1 - sum(err^2)/sst
  cat(sqrt(mean(err^2)), '\t', r_square, '\n')
}
# Compare all fitted models on the held-out validation set.
cat('RMSE and Adjusted R Square on Validation Data\n\n')
## RMSE and Adjusted R Square on Validation Data
cat('linear full:\t\t')
## linear full:
eval_metrics(linear_model, validation)
## 23069.92 0.8758192
cat('linear AIC:\t\t')
## linear AIC:
eval_metrics(linear_model.AIC, validation)
## 23208.37 0.8772922
cat('linear BIC:\t\t')
## linear BIC:
eval_metrics(linear_model.BIC, validation)
## 23616.35 0.8763994
cat('ridge:\t\t\t')
## ridge:
# Fixed: the ridge model was being evaluated with the lasso's optimal lambda
# and the lasso with the ridge's (swapped arguments). The recorded outputs
# below predate this fix.
eval_metrics_gml(ridge_model_glmridge, validation, optimal_lambda_ridge)
## 23073.38 0.8824473
cat('lasso:\t\t\t')
## lasso:
eval_metrics_gml(lasso_model_glmridge, validation, optimal_lambda_lasso)
## 23669.87 0.8762908
cat('elastic net:\t\t')
## elastic net:
eval_metrics_elastic_gml(elastic_model, validation)
## 23084.77 0.8823313
cat('bayesian linear:\t')
## bayesian linear:
eval_metrics_bma(bma, validation)
## 23616.35 0.8768497
cat('kernel regression:\t')
## kernel regression:
eval_metrics_kernel(bw_ll, validation)
## 26203.26 0.8802653
cat('regression tree:\t')
## regression tree:
eval_metrics_others(best_tree, validation)
## 37963.24 0.6817734
cat('random forest:\t\t')
## random forest:
eval_metrics_others(best_forest, validation)
## 24319.89 0.869403
cat('xgboost model:\t\t')
## xgboost model:
eval_metrics_xgb(xgb_model, validation)
## 22949.22 0.883709
cat('nn model:\t\t')
## nn model:
eval_metrics_others(nn_model, validation)
## 22975.79 0.8834396
The resulting RMSE is smaller compared to the RMSEs from both the training and the testing data.
# Calculate proportion of observations that fall within prediction intervals
# (95% by default). The intervals are computed on the log scale and
# exponentiated back to dollars before being compared with observed prices.
interval.linear_model.AIC <- exp(predict(linear_model.AIC, validation, interval="prediction"))
coverage.prob.linear_model.AIC <- mean(validation$price > interval.linear_model.AIC[,"lwr"] & validation$price < interval.linear_model.AIC[,"upr"])
coverage.prob.linear_model.AIC
## [1] 0.9633028
Finally, about 96% of the observations contain the true price of the house within their 95% predictive confidence (or credible) intervals. From this result, we can state that our final model properly reflects uncertainty.
We studied the housing price data set in detail in the EDA section. We found that there is strong multicollinearity between many variables (both numerical and categorical). Strong correlations and statistical tests for significance indicate that attributes like Overall.Qual, area and Total.Bsmt.SF make the best predictors for the response variable price. These attributes all have a positive association with the response variable. Our final linear regression achieved an RMSE of 21872.24 and a coverage percentage of about 96% (i.e. the percentage of observations containing the true price within their 95% predictive confidence (or credible) intervals).
Referring back to the original purpose of this study, which is to identify the houses that are underpriced in the market, those house could be identified as any house under the blue line in the plot AIC Model Validation (see above).